Created
December 7, 2020 22:17
-
-
Save clinejj/a95685eba991e79df596e3cfaaf802d5 to your computer and use it in GitHub Desktop.
Emojipedia Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* This is helpful if you want to download emojis for... Slack? Whatever you need.
 * Requires cheerio and node-fetch to be available.
 *
 * Things to update:
 * - The scrapeURLs, to choose which Emoji versions you want to download.
 * - The imageUrl used below is the first image in the vendor list, which happens to be
 *   Apple. If you want a different vendor, select a different entry from that list.
 */
const cheerio = require('cheerio'); | |
const fetch = require('node-fetch'); | |
const {createWriteStream} = require('fs'); | |
const fs = require('fs'); | |
const {pipeline} = require('stream'); | |
const {promisify} = require('util'); | |
const streamPipeline = promisify(pipeline); | |
/** Site origin; the relative emoji page paths found on the index pages are appended to it. */
const host = 'https://emojipedia.org';

/**
 * Emoji version index pages to scrape. The fourth `/`-separated segment of
 * each URL (e.g. `emoji-13.0`) is used as the name of the output directory.
 */
const scrapeURLs = [
  'https://emojipedia.org/emoji-11.0/',
  'https://emojipedia.org/emoji-12.0/',
  'https://emojipedia.org/emoji-12.1/',
  'https://emojipedia.org/emoji-13.0/',
];

/** Browser-like User-Agent string sent with every HTML page request. */
const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36';

/** Options passed to `fetch` when requesting index and emoji pages. */
const fetchOptions = { headers: { 'User-Agent': userAgent } };
/**
 * Extracts the file extension (including the leading dot) from a URL path.
 *
 * @param {string} pathname - Path component of the image URL, without query or hash.
 * @returns {string} The extension such as `.png`, or `''` when the last path
 *   segment has none.
 */
const getImageExtension = (pathname) => {
  const lastSegment = pathname.slice(pathname.lastIndexOf('/') + 1);
  const dotIndex = lastSegment.lastIndexOf('.');
  // dotIndex === 0 would be a dot-file with no real extension.
  return dotIndex > 0 ? lastSegment.slice(dotIndex) : '';
};

/**
 * Downloads the image for a single emoji page into `./<prefix>/<name><ext>`.
 *
 * Loads the emoji page, takes the `srcset` of the first image matched by
 * `.vendor-list .vendor-image img`, and streams the first candidate URL in
 * that srcset to disk. The download is best-effort: every failure is logged
 * and the function returns without throwing.
 *
 * @param {string} url - Absolute URL of the emoji's page.
 * @param {string} prefix - Name of the (already existing) output directory.
 * @param {string} name - Base file name for the image, without extension.
 * @returns {Promise<void>} Resolves once the file is written or the download was skipped.
 */
const downloadEmoji = async (url, prefix, name) => {
  let body;
  try {
    const resp = await fetch(url, fetchOptions);
    if (!resp.ok) {
      console.log(`Could not load ${url} (HTTP ${resp.status})`);
      return;
    }
    body = await resp.text();
  } catch (error) {
    console.log(`Could not load ${url}: ${error.message}`);
    return;
  }

  const $ = cheerio.load(body);
  const srcset = $('.vendor-list .vendor-image img').attr('srcset');
  if (!srcset) return;

  // A srcset candidate is "<url> <descriptor>"; keep only the first URL.
  const imageUrl = srcset.trim().split(/\s+/)[0];

  try {
    // Resolving against the page URL also handles relative image URLs.
    const resolvedUrl = new URL(imageUrl, url);
    const imageResp = await fetch(resolvedUrl.href);
    if (!imageResp.ok) {
      console.log(`Could not download ${prefix}/${name}`);
      return;
    }

    // Derive the extension from the path so that query strings and
    // extensions that are not three characters long are handled correctly.
    const fileExt = getImageExtension(resolvedUrl.pathname);
    const filename = `./${prefix}/${name}${fileExt}`;
    console.log(`Downloading ${filename}`);
    await streamPipeline(imageResp.body, createWriteStream(filename));
  } catch (error) {
    console.log(`Could not download ${prefix}/${name}: ${error.message}`);
  }
};
/**
 * Finds every emoji linked from an emoji-version index page and downloads its image.
 *
 * Each child of `.content article ul` that contains an `<a href>` is treated
 * as one emoji; the href with all slashes removed becomes the file name. All
 * downloads are started concurrently, and a failure of one download is logged
 * without affecting the others.
 *
 * @param {string} body - HTML source of the index page.
 * @param {string} prefix - Output directory name passed through to `downloadEmoji`.
 * @returns {Promise<void>} Resolves when every download has finished or failed.
 */
const processEmojiPage = async (body, prefix) => {
  const $ = cheerio.load(body);
  const emojiPaths = $('.content article ul')
    .children()
    .toArray()
    .map((element) => $(element).children('a').attr('href'))
    .filter(Boolean); // skip list items without a link

  await Promise.all(
    emojiPaths.map(async (path) => {
      const name = path.replace(/\//g, '');
      try {
        await downloadEmoji(`${host}${path}`, prefix, name);
      } catch (error) {
        // Keep one failed emoji from rejecting the whole batch.
        console.log(`Could not download ${prefix}/${name}: ${error.message}`);
      }
    }),
  );
};
/**
 * Scrapes every index page listed in `scrapeURLs`.
 *
 * For each URL the page is fetched, an output directory named after the
 * fourth `/`-separated URL segment (e.g. `emoji-13.0`) is created if needed,
 * and the page is handed to `processEmojiPage`. The pages are processed
 * concurrently; a failure on one page is logged and does not stop the others.
 *
 * @returns {Promise<void>} Resolves when all index pages have been processed.
 */
const processUrls = async () => {
  await Promise.all(
    scrapeURLs.map(async (url) => {
      try {
        const resp = await fetch(url, fetchOptions);
        if (!resp.ok) {
          console.log(`Could not load ${url} (HTTP ${resp.status})`);
          return;
        }
        const body = await resp.text();
        const prefix = url.split('/')[3];
        // `recursive: true` makes this a no-op when the directory already exists.
        fs.mkdirSync(`./${prefix}`, { recursive: true });
        await processEmojiPage(body, prefix);
      } catch (error) {
        console.log(`Could not process ${url}: ${error.message}`);
      }
    }),
  );
};
processUrls(); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment