Skip to content

Instantly share code, notes, and snippets.

@clinejj
Created December 7, 2020 22:17
Show Gist options
  • Save clinejj/a95685eba991e79df596e3cfaaf802d5 to your computer and use it in GitHub Desktop.
Save clinejj/a95685eba991e79df596e3cfaaf802d5 to your computer and use it in GitHub Desktop.
Emojipedia Scraper
/* Downloads emoji images from Emojipedia. This is helpful if you want emojis for
* Slack, or whatever else you need.
* Requires cheerio and node-fetch to be available.
*
* Things to update:
* - The scrapeURLs list, to choose which Emoji versions you want to download.
* - The image URL used below is taken from the first entry in the vendor list, which
* happens to be Apple. If you want a different vendor, select a different entry from
* that list instead of the first one.
*/
const cheerio = require('cheerio');
const fetch = require('node-fetch');
const {createWriteStream} = require('fs');
const fs = require('fs');
const {pipeline} = require('stream');
const {promisify} = require('util');
const streamPipeline = promisify(pipeline);
// Root of the site; hrefs scraped from the listing pages are appended to it.
const host = 'https://emojipedia.org';

// Listing pages to scrape, one per Emoji version. Edit the version list to
// choose which Emoji releases get downloaded.
const scrapeURLs = ['11.0', '12.0', '12.1', '13.0'].map(
  (version) => `${host}/emoji-${version}/`
);

// Browser-style User-Agent string, sent through fetchOptions.
const userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36';

// Options object passed as the second argument to fetch().
const fetchOptions = {
  headers: {
    'User-Agent': userAgent,
  },
};
/**
 * Downloads the image for a single emoji from its Emojipedia page.
 *
 * Fetches the page at `url`, reads the `srcset` attribute of the first image
 * matched in the vendor list, downloads the first URL listed there and writes
 * it to `./<prefix>/<name><ext>`. Every failure is logged and the emoji is
 * skipped, so the returned promise never rejects.
 *
 * @param {string} url - URL of the emoji's page.
 * @param {string} prefix - Directory (relative to the working directory) to
 *   save into; it must already exist.
 * @param {string} name - File name to use, without extension.
 * @returns {Promise<void>} Resolves once the download was written or skipped.
 */
const downloadEmoji = async (url, prefix, name) => {
  const label = `${prefix}/${name}`;
  try {
    const resp = await fetch(url, fetchOptions);
    if (!resp.ok) {
      // Do not try to parse an error page as if it were the emoji page.
      console.log(`Could not download ${label}`);
      return;
    }

    const $ = cheerio.load(await resp.text());
    const srcset = $('.vendor-list .vendor-image img').attr('srcset');
    if (!srcset) return;

    // A srcset candidate is "<url> <descriptor>"; keep the first candidate's
    // URL and resolve it against the page in case it is relative.
    const [firstCandidate] = srcset.trim().split(/\s+/);
    const imageUrl = new URL(firstCandidate, url);

    // Take the extension from the URL path only, so query strings and
    // extensions that are not exactly three letters are handled correctly.
    const { pathname } = imageUrl;
    const dotIndex = pathname.lastIndexOf('.');
    const fileExt = dotIndex > pathname.lastIndexOf('/') ? pathname.slice(dotIndex) : '';

    const imageResp = await fetch(imageUrl.href, fetchOptions);
    if (!imageResp.ok) {
      console.log(`Could not download ${label}`);
      return;
    }

    const filename = `./${prefix}/${name}${fileExt}`;
    console.log(`Downloading ${filename}`);
    await streamPipeline(imageResp.body, createWriteStream(filename));
  } catch (error) {
    // Network, URL-parsing and file-system errors all end up here; report
    // them instead of failing silently or rejecting into an un-awaited caller.
    console.log(`Could not download ${label}: ${error.message}`);
  }
};
/**
 * Finds every emoji linked from a listing page and downloads each one.
 *
 * Each child of the `.content article ul` lists is inspected for a link; the
 * link's href, with its slashes removed, becomes the emoji's file name.
 * Downloads are started together and a failure of one is logged without
 * affecting the others.
 *
 * @param {string} body - HTML of the listing page.
 * @param {string} prefix - Directory name handed on to downloadEmoji.
 * @returns {Promise<void>} Resolves once every download has been attempted.
 */
const processEmojiPage = async (body, prefix) => {
  const $ = cheerio.load(body);
  const listItems = $('.content article ul').children().toArray();

  // Collect the downloads as promises so callers can wait for all of them;
  // an async callback handed to .each() would be fired and forgotten.
  await Promise.all(listItems.map(async (element) => {
    const path = $(element).children('a').attr('href');
    if (!path) return;
    const name = path.replace(/\//g, '');
    try {
      await downloadEmoji(new URL(path, host).href, prefix, name);
    } catch (error) {
      console.log(`Could not download ${prefix}/${name}: ${error.message}`);
    }
  }));
};
/**
 * Scrapes every listing page in scrapeURLs.
 *
 * For each URL the page is fetched, a directory named after the first path
 * segment of the URL (e.g. "emoji-13.0") is created if needed, and the page
 * is handed to processEmojiPage. Pages are processed concurrently; a failure
 * on one page is logged and does not stop the others.
 *
 * @returns {Promise<void>} Resolves once every listing page has been handled.
 */
const processUrls = async () => {
  // map + Promise.all instead of forEach, so the work can actually be awaited.
  await Promise.all(scrapeURLs.map(async (url) => {
    try {
      const resp = await fetch(url, fetchOptions);
      if (!resp.ok) {
        console.log(`Could not fetch ${url}: HTTP ${resp.status}`);
        return;
      }
      const body = await resp.text();

      const [prefix] = new URL(url).pathname.split('/').filter(Boolean);
      if (!prefix) {
        console.log(`Could not derive a directory name from ${url}`);
        return;
      }

      // recursive: true makes this a no-op when the directory already exists,
      // which avoids a separate existence check.
      fs.mkdirSync(`./${prefix}`, { recursive: true });
      await processEmojiPage(body, prefix);
    } catch (error) {
      console.log(`Could not process ${url}: ${error.message}`);
    }
  }));
};
// Entry point: start scraping as soon as the script is run.
processUrls();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment