Skip to content

Instantly share code, notes, and snippets.

@adrianhorning08
Created September 30, 2024 18:03
Show Gist options
  • Save adrianhorning08/aa7ca71a0935a9e59ead986ce26a1afd to your computer and use it in GitHub Desktop.
Save adrianhorning08/aa7ca71a0935a9e59ead986ce26a1afd to your computer and use it in GitHub Desktop.
Scraping Chrome Extensions
import fetch from "node-fetch";
import * as cheerio from "cheerio";
import fs from "graceful-fs";
import { gotScraping } from "got-scraping";
import {
getAllItemsFromSupabaseTable,
getProxyAgent,
getProxyUrl,
getSmartProxyUrl,
supabase,
} from "./utils.js";
/**
 * Pulls the JSON payload out of a raw response string.
 *
 * The first 11 characters are always skipped. The amount of trailing text to
 * discard is not known up front, so every trim length from 100 down to 1 is
 * tried until `JSON.parse` accepts the remaining slice.
 *
 * @param {string} html - Raw response text.
 * @returns {*} The first value that parses successfully, or `undefined` when
 *   no trim length produces valid JSON.
 */
function getJSON(html) {
  // 87 worked once
  for (let trailing = 100; trailing > 0; trailing -= 1) {
    try {
      // Slicing stays inside the try so that a non-string argument is treated
      // like any other failed attempt.
      const candidate = html.substring(11, html.length - trailing);
      return JSON.parse(candidate);
    } catch (parseError) {
      // Not valid JSON at this trim length; keep one more character next time.
    }
  }
  return undefined;
}
/**
 * Builds a tolerant accessor for deeply nested array/object data.
 *
 * The returned `lookup(...indexes)` walks `data` one index at a time and
 * returns whatever sits at the end of the path. If any step of the path cannot
 * be dereferenced (the intermediate value is `null`/`undefined`), `null` is
 * returned instead of throwing.
 *
 * The previous implementation built a source string and ran it through `eval`
 * against a `data` variable that was never in scope, so every lookup yielded
 * `null`. The data is now passed in explicitly and no code is evaluated.
 *
 * @param {*} [data] - Structure to read from. When omitted, every lookup
 *   returns `null`.
 * @returns {function(...(number|string)): *} Accessor taking the path as
 *   individual index/key arguments.
 */
function prepareLookup(data) {
  return function lookup(...indexes) {
    try {
      // Starting from `null` (rather than `undefined`) keeps the "no data"
      // case returning null even when no indexes are supplied.
      return indexes.reduce((node, index) => node[index], data ?? null);
    } catch (e) {
      // A missing intermediate level means the data is simply not present.
      return null;
    }
  };
}
/**
 * Converts raw positional listing tuples into plain listing objects.
 *
 * Each entry of `arr` wraps the tuple one level deep (`listing[0]`), or two
 * levels deep (`listing[0][0]`) when `type` is "search". Tuple positions used:
 * 0 = id, 2 = name, 3 = rating, 4 = rating count, 6 = description,
 * 7 = website. Positions 1 and 5 (avatar and image in the original notes) are
 * intentionally left out of the result.
 *
 * @param {Array} arr - Raw listing wrappers.
 * @param {string} [type="category"] - Pass "search" for search-result nesting.
 * @returns {Array<object>} One object per listing with `id`, `url`, `name`,
 *   `rating`, `rating_count`, `description` and `website`.
 */
function jsonifyListings(arr, type = "category") {
  const unwrap =
    type === "search" ? (listing) => listing[0][0] : (listing) => listing[0];

  return arr.map((listing) => {
    const fields = unwrap(listing);
    const extensionId = fields[0];
    return {
      id: extensionId,
      url: `https://chrome.google.com/webstore/detail/${extensionId}?hl=en&gl=US&authuser=0`,
      name: fields[2],
      rating: fields[3],
      rating_count: fields[4],
      description: fields[6],
      website: fields[7],
    };
  });
}
/**
 * Fetches a follow-up page of category listings from the Chrome Web Store
 * `batchexecute` endpoint (rpc id `zTyKYc`).
 *
 * Any failure (network error, non-200 status, unparsable payload) triggers an
 * immediate retry, up to 10 retries. After the last retry the error is logged
 * and an empty result is returned, so callers never see a rejection.
 *
 * NOTE(review): the `source-path`, `f.sid`, `bl`, `_reqid` and `at` values are
 * hard-coded; they look like values captured from one browser session —
 * confirm they are still accepted.
 *
 * @param {string} category - Category path, e.g. "lifestyle/fun"; URL-encoded
 *   before being placed in the request body.
 * @param {string} token - Pagination token from the previous page. It is
 *   inserted as-is and followed by an encoded "=" (`%3D`).
 * @param {number} [retries=0] - Internal retry counter.
 * @returns {Promise<{listings: Array<object>, nextToken: (string|null|undefined)}>}
 */
async function getSecondPageChromeExtensionListings(
  category,
  token,
  retries = 0
) {
  try {
    const res = await fetch(
      `https://chromewebstore.google.com/_/ChromeWebStoreConsumerFeUi/data/batchexecute?rpcids=zTyKYc&source-path=%2Fcategory%2Fextensions%2Flifestyle%2Fentertainment&f.sid=1835170610361360018&bl=boq_chrome-webstore-consumerfe-ui_20240110.06_p0&hl=en-GB&soc-app=1&soc-platform=1&soc-device=1&_reqid=51442959&rt=c`,
      {
        agent: getProxyAgent(),
        headers: {
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.216 Safari/537.36",
          accept: "*/*",
          "accept-language": "en-US,en-CA;q=0.9,en-AU;q=0.8,en;q=0.7",
          "content-type": "application/x-www-form-urlencoded;charset=UTF-8",
          "sec-ch-ua":
            '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
          "sec-ch-ua-arch": '"arm"',
          "sec-ch-ua-bitness": '"64"',
          "sec-ch-ua-full-version": '"120.0.6099.216"',
          "sec-ch-ua-full-version-list":
            '"Not_A Brand";v="8.0.0.0", "Chromium";v="120.0.6099.216", "Google Chrome";v="120.0.6099.216"',
          "sec-ch-ua-mobile": "?0",
          "sec-ch-ua-model": '""',
          "sec-ch-ua-platform": '"macOS"',
          "sec-ch-ua-platform-version": '"13.0.1"',
          "sec-ch-ua-wow64": "?0",
          "sec-fetch-dest": "empty",
          "sec-fetch-mode": "cors",
          "sec-fetch-site": "same-origin",
          "x-client-data":
            "CJC2yQEIorbJAQipncoBCPP3ygEIk6HLAQiGoM0BCLvIzQEIjuHNAQip6c0BCIPwzQEIhfDNAQix8c0BCKjyzQEIgvTNARj2yc0BGKfqzQEY642lFw==",
          "x-same-domain": "1",
          Referer: "https://chromewebstore.google.com/",
          "Referrer-Policy": "origin",
        },
        body: `f.req=%5B%5B%5B%22zTyKYc%22%2C%22%5B%5Bnull%2C%5B%5B3%2C%5C%22${encodeURIComponent(
          category
        )}%5C%22%2Cnull%2Cnull%2C2%2C%5B32%2C%5C%22${token}%3D%5C%22%5D%5D%5D%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D&at=AI9LxBERjt7goBH29vAdqXebPVgb%3A1705341357475&`,
        method: "POST",
      }
    );
    console.log("getSecondPageChromeExtensionListings res.status", res.status);
    if (res.status !== 200) {
      // Report the real status; the old hard-coded "403" was misleading for
      // every other failure (429, 5xx, ...).
      throw new Error(`Unexpected response status ${res.status}`);
    }
    const html = await res.text();
    const json = getJSON(html);
    // The listings payload is itself a JSON string nested inside the envelope.
    const preparedData = JSON.parse(json[0][2]);
    const nextToken = preparedData?.[2]?.[0];
    console.log("nextToken", nextToken);
    const arrWithAllTheData = preparedData[0][0][0][13][0][0];
    const jsonifiedListings = jsonifyListings(arrWithAllTheData);
    return {
      listings: jsonifiedListings,
      nextToken,
    };
  } catch (error) {
    if (retries < 10) {
      return getSecondPageChromeExtensionListings(category, token, retries + 1);
    }
    console.log("error at getSecondPageChromeExtensionListings", error.message);
    return {
      listings: [],
      nextToken: null,
    };
  }
}
/**
 * Returns the first email-looking string found in a page's HTML.
 *
 * Matches whose final segment is an image extension are skipped: retina asset
 * names such as `logo@2x.png` satisfy the email pattern but are not addresses.
 *
 * @param {string} html - Page markup to scan.
 * @returns {(string|undefined)} The first plausible email in document order,
 *   or `undefined` when none is found or `html` is not a string.
 */
function getEmailOnThePage(html) {
  if (typeof html !== "string") {
    return undefined;
  }
  const emailPattern = /([a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)/gi;
  // Not global, so `.test` carries no `lastIndex` state between candidates.
  const imageSuffix = /\.(png|jpe?g|gif|svg|webp|bmp|ico)$/i;
  // `match` returns null when nothing matches.
  const candidates = html.match(emailPattern) ?? [];
  // Only the first hit is used, so no de-duplication pass is needed.
  return candidates.find((candidate) => !imageSuffix.test(candidate));
}
/**
 * Downloads an extension page through the smart proxy and extracts the first
 * email address found in its HTML.
 *
 * Each attempt has a 10 second request timeout and got-level retries are
 * disabled; instead this function retries itself up to 3 times on any failure
 * (thrown error or non-200 status).
 *
 * @param {string} url - Page to download.
 * @param {number} [retries=0] - Internal retry counter.
 * @returns {Promise<({email: (string|undefined), url: string}|undefined)>}
 *   The extracted email with the requested URL, or `undefined` once all
 *   retries have failed (the last error is logged).
 */
async function getExtensionPage(url, retries = 0) {
  try {
    const res = await gotScraping({
      url,
      proxyUrl: getSmartProxyUrl(),
      timeout: {
        request: 10000,
      },
      retry: {
        limit: 0,
      },
    });
    if (res.statusCode !== 200) {
      // Report the real status instead of a hard-coded "403".
      throw new Error(`Unexpected response status ${res.statusCode}`);
    }
    return {
      email: getEmailOnThePage(res.body),
      url,
    };
  } catch (error) {
    if (retries < 3) {
      return getExtensionPage(url, retries + 1);
    }
    console.log("error at getExtensionPage", error.message);
    return undefined;
  }
}
/**
 * Loads the first page of a Chrome Web Store category and extracts the
 * listings embedded in the page's inline script data.
 *
 * The script tag containing "lh3.googleusercontent" is located, the text
 * between `data:` and `sideChannel: {}` is cut out, and that literal is
 * evaluated to obtain the nested listing arrays.
 *
 * On any failure the error is logged and an empty result is returned, matching
 * the shape the sibling page fetchers return on failure. Previously this
 * returned `undefined`, which made callers crash when reading `.listings`.
 *
 * @param {string} category - Category path, e.g. "lifestyle/fun".
 * @returns {Promise<{listings: Array<object>, nextToken: (string|null)}>}
 */
async function getExtensionSearchFirstPage(category) {
  try {
    const res = await gotScraping({
      url: `https://chromewebstore.google.com/category/extensions/${category}?hl=en-GB`,
      proxyUrl: getProxyUrl(),
    });
    const html = res.body;
    console.log("getExtensionSearchFirstPage res.statusCode", res.statusCode);
    const $ = cheerio.load(html);
    const scriptTags = $("script");
    const scriptTagWithJson = scriptTags.filter((i, el) => {
      const text = $(el).html();
      return text.includes("lh3.googleusercontent");
    });
    let parsed = scriptTagWithJson.html().split("data:")[1];
    parsed = parsed.split("sideChannel: {}")[0];
    // Drop the trailing ", " left in front of the removed `sideChannel` key.
    parsed = parsed.slice(0, parsed.length - 2);
    // SECURITY(review): this evaluates text downloaded from a remote page. A
    // compromised proxy or an unexpected page could execute arbitrary code
    // here. If the literal is always valid JSON, switch to JSON.parse —
    // not changed here because that has not been verified.
    const json = eval(parsed);
    const nextToken = json[2][0];
    const arrWithAllTheData = json[0][0][0][13][0][0];
    const jsonifiedListings = jsonifyListings(arrWithAllTheData);
    return {
      listings: jsonifiedListings,
      nextToken,
    };
  } catch (error) {
    console.log("error at getExtensionSearchFirstPage", error.message);
    return {
      listings: [],
      nextToken: null,
    };
  }
}
/**
 * Collects every listing of a category by following pagination tokens.
 *
 * The first page comes from `getExtensionSearchFirstPage`; each further page
 * is requested from `getSecondPageChromeExtensionListings` for as long as the
 * previous page supplied a token.
 *
 * Fixes over the previous version: a missing first-page result no longer
 * crashes, and no follow-up request is issued when the first page carries no
 * token (the old loop always ran at least once with a null token).
 *
 * @param {string} category - Category path, e.g. "lifestyle/fun".
 * @returns {Promise<Array<object>>} All listings, each tagged with `category`.
 */
async function getAllChromeExtensionListings(category) {
  const allListings = [];
  const extensionRes = await getExtensionSearchFirstPage(category);
  const firstListings = extensionRes?.listings ?? [];
  console.log("extensionRes.listings.length", firstListings.length);
  allListings.push(...firstListings);
  let nextToken = extensionRes?.nextToken;
  console.log("extensionRes.nextToken", nextToken);
  while (nextToken) {
    const secondPage = await getSecondPageChromeExtensionListings(
      category,
      nextToken
    );
    console.log("secondPage.listings.length", secondPage.listings.length);
    allListings.push(...secondPage.listings);
    console.log("secondPage.nextToken", secondPage.nextToken);
    nextToken = secondPage.nextToken;
  }
  return allListings.map((listing) => ({
    ...listing,
    category,
  }));
}
/**
 * Runs one page of a Chrome Web Store search through the `batchexecute`
 * endpoint (rpc id `zTyKYc`).
 *
 * Without `token` the first page is requested; with `token` the page that
 * token points at. Any failure triggers an immediate retry, up to 3 retries;
 * after that the error is logged and an empty result is returned.
 *
 * The query is now URL-encoded before being placed in the form body, as the
 * token already was. Previously queries containing spaces, `&`, `%` and
 * similar characters were sent raw in an `application/x-www-form-urlencoded`
 * body.
 *
 * NOTE(review): `f.sid`, `bl`, `_reqid` and `at` are hard-coded; they look like
 * values captured from one browser session — confirm they are still accepted.
 *
 * @param {string} query - Search text.
 * @param {string} [token] - Pagination token from the previous page.
 * @param {number} [retries=0] - Internal retry counter.
 * @returns {Promise<{listings: Array<object>, nextToken: (string|null|undefined)}>}
 */
async function searchExtensions(query, token, retries = 0) {
  try {
    const encodedQuery = encodeURIComponent(query);
    let body = null;
    if (token) {
      body = `f.req=%5B%5B%5B%22zTyKYc%22%2C%22%5B%5Bnull%2C%5Bnull%2Cnull%2Cnull%2C%5B%5C%22${encodedQuery}%5C%22%2C%5B150%2C%5C%22${encodeURIComponent(
        token
      )}%5C%22%5D%5D%5D%5D%5D%22%2Cnull%2C%22generic%22%5D%5D%5D&at=AI9LxBEkJm-0AYRS7g2Fpkbu5yqj%3A1705362687679&`;
    } else {
      body = `f.req=%5B%5B%5B%22zTyKYc%22%2C%22%5B%5Bnull%2C%5Bnull%2Cnull%2Cnull%2C%5B%5C%22${encodedQuery}%5C%22%2C%5B150%5D%5D%5D%5D%5D%22%2Cnull%2C%221%22%5D%5D%5D&at=AI9LxBEkJm-0AYRS7g2Fpkbu5yqj%3A1705362687679&`;
    }
    const res = await fetch(
      "https://chromewebstore.google.com/_/ChromeWebStoreConsumerFeUi/data/batchexecute?rpcids=zTyKYc&source-path=%2Fsearch%2Fa&f.sid=7423966586146055674&bl=boq_chrome-webstore-consumerfe-ui_20240110.06_p0&hl=en-GB&soc-app=1&soc-platform=1&soc-device=1&_reqid=6864292&rt=c",
      {
        agent: getProxyAgent(),
        headers: {
          "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_0_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.216 Safari/537.36",
          accept: "*/*",
          "accept-language": "en-US,en-CA;q=0.9,en-AU;q=0.8,en;q=0.7",
          "content-type": "application/x-www-form-urlencoded;charset=UTF-8",
          "sec-ch-ua":
            '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
          "sec-ch-ua-arch": '"arm"',
          "sec-ch-ua-bitness": '"64"',
          "sec-ch-ua-full-version": '"120.0.6099.216"',
          "sec-ch-ua-full-version-list":
            '"Not_A Brand";v="8.0.0.0", "Chromium";v="120.0.6099.216", "Google Chrome";v="120.0.6099.216"',
          "sec-ch-ua-mobile": "?0",
          "sec-ch-ua-model": '""',
          "sec-ch-ua-platform": '"macOS"',
          "sec-ch-ua-platform-version": '"13.0.1"',
          "sec-ch-ua-wow64": "?0",
          "sec-fetch-dest": "empty",
          "sec-fetch-mode": "cors",
          "sec-fetch-site": "same-origin",
          "x-client-data":
            "CJC2yQEIorbJAQipncoBCPP3ygEIk6HLAQiGoM0BCLvIzQEIjuHNAQip6c0BCIPwzQEIhfDNAQix8c0BCKjyzQEIgvTNARj2yc0BGKfqzQEY642lFw==",
          "x-same-domain": "1",
          Referer: "https://chromewebstore.google.com/",
          "Referrer-Policy": "origin",
        },
        body,
        method: "POST",
      }
    );
    console.log("searchExtensions res.status", res.status);
    const html = await res.text();
    // Debug artifact: keeps the most recent raw response on disk.
    fs.writeFileSync("test.html", html);
    const json = getJSON(html);
    // The listings payload is itself a JSON string nested inside the envelope.
    const preparedData = JSON.parse(json[0][2]);
    const nextToken = preparedData?.[2]?.[0];
    console.log("nextToken", nextToken);
    const arrWithAllTheData = preparedData[0][0][0][5][0][0];
    const jsonifiedListings = jsonifyListings(arrWithAllTheData, "search");
    return {
      listings: jsonifiedListings,
      nextToken,
    };
  } catch (error) {
    if (retries < 3) {
      return searchExtensions(query, token, retries + 1);
    }
    console.log("error at searchExtensions", error.message);
    return {
      listings: [],
      nextToken: null,
    };
  }
}
/**
 * Collects every search result for a query by following pagination tokens.
 *
 * Follow-up pages are requested only while the previous page supplied a
 * token. The old loop always ran at least once, so a first page without a
 * token caused `searchExtensions(query, undefined)` — i.e. the first page —
 * to be fetched again and its listings appended twice.
 *
 * @param {string} query - Search text.
 * @returns {Promise<Array<object>>} Listings from all pages, in page order.
 */
async function getAllChromeExtensionListingsFromQuery(query) {
  const allListings = [];
  const extensionRes = await searchExtensions(query);
  console.log("extensionRes.listings.length", extensionRes.listings.length);
  allListings.push(...extensionRes.listings);
  console.log("extensionRes.nextToken", extensionRes.nextToken);
  let nextToken = extensionRes.nextToken;
  while (nextToken) {
    const secondPage = await searchExtensions(query, nextToken);
    console.log("secondPage.listings.length", secondPage.listings.length);
    allListings.push(...secondPage.listings);
    console.log("secondPage.nextToken", secondPage.nextToken);
    nextToken = secondPage.nextToken;
  }
  return allListings;
}
/**
 * Requests the detail data of one extension from the Chrome Web Store
 * `batchexecute` endpoint (rpc ids `xY2Ddd`, `nwZOzf`, `x1DgCd`).
 *
 * The `id` argument is now placed in the request body. Previously it was
 * ignored and a sample extension id was always sent. The raw response text is
 * returned so callers can use it; before, the response was discarded.
 *
 * NOTE(review): the URL's `source-path` still names the sample extension, and
 * `f.sid`, `bl`, `_reqid` and `at` are hard-coded — presumably captured from
 * one browser session; confirm the endpoint accepts them for other ids.
 * NOTE(review): the response is returned unparsed and its status is not
 * checked.
 *
 * @param {string} [id] - Extension id. Falls back to the sample id that used
 *   to be hard-coded, so a call without an argument sends the same request as
 *   before.
 * @returns {Promise<(string|undefined)>} Raw response body, or `undefined`
 *   when the request fails (the error is logged).
 */
async function getExtensionInfo(id) {
  try {
    const extensionId = encodeURIComponent(
      id ?? "ppbmlcfgohokdanfpeoanjcdclffjncg"
    );
    const res = await fetch(
      "https://chromewebstore.google.com/_/ChromeWebStoreConsumerFeUi/data/batchexecute?rpcids=xY2Ddd%2CnwZOzf%2Cx1DgCd&source-path=%2Fdetail%2Fad-library-ad-finder-adsp%2Fppbmlcfgohokdanfpeoanjcdclffjncg%2Freviews&f.sid=-8720772806426539589&bl=boq_chrome-webstore-consumerfe-ui_20240110.06_p0&hl=en&soc-app=1&soc-platform=1&soc-device=1&_reqid=1350824&rt=c",
      {
        headers: {
          accept: "*/*",
          "accept-language": "en-US,en-CA;q=0.9,en-AU;q=0.8,en;q=0.7",
          "content-type": "application/x-www-form-urlencoded;charset=UTF-8",
          "sec-ch-ua":
            '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
          "sec-ch-ua-arch": '"arm"',
          "sec-ch-ua-bitness": '"64"',
          "sec-ch-ua-full-version": '"120.0.6099.216"',
          "sec-ch-ua-full-version-list":
            '"Not_A Brand";v="8.0.0.0", "Chromium";v="120.0.6099.216", "Google Chrome";v="120.0.6099.216"',
          "sec-ch-ua-mobile": "?0",
          "sec-ch-ua-model": '""',
          "sec-ch-ua-platform": '"macOS"',
          "sec-ch-ua-platform-version": '"13.0.1"',
          "sec-ch-ua-wow64": "?0",
          "sec-fetch-dest": "empty",
          "sec-fetch-mode": "cors",
          "sec-fetch-site": "same-origin",
          "x-client-data":
            "CJC2yQEIorbJAQipncoBCPP3ygEIk6HLAQiGoM0BCLvIzQEIjuHNAQip6c0BCIPwzQEIhfDNAQix8c0BCKjyzQEIgvTNARj2yc0BGKfqzQEY642lFw==",
          "x-same-domain": "1",
          Referer: "https://chromewebstore.google.com/",
          "Referrer-Policy": "origin",
        },
        body: `f.req=%5B%5B%5B%22xY2Ddd%22%2C%22%5B%5C%22${extensionId}%5C%22%5D%22%2Cnull%2C%221%22%5D%2C%5B%22nwZOzf%22%2C%22%5B%5D%22%2Cnull%2C%225%22%5D%2C%5B%22x1DgCd%22%2C%22%5B%5C%22${extensionId}%5C%22%2C%5B10%5D%2C2%5D%22%2Cnull%2C%228%22%5D%5D%5D&at=AI9LxBFL6EfcYKQ83QsTFSRfT86b%3A1705435623174&`,
        method: "POST",
      }
    );
    return await res.text();
  } catch (error) {
    console.log("error at getExtensionInfo", error.message);
    return undefined;
  }
}
/**
 * Crawls the store search for a fixed list of queries and upserts the
 * de-duplicated results of each query into the `chrome_extensions` table.
 *
 * Queries are processed one after another. For each query the raw result set
 * is also dumped to `test.json` (debug artifact, overwritten every time).
 * Upsert errors are logged and do not stop the crawl.
 *
 * The single-letter queries previously skipped "w"; it is now included.
 *
 * @returns {Promise<void>}
 */
async function getAllChromeExtensionListingsByQuery() {
  const queries = [
    "business",
    "software",
    "marketing",
    "sales",
    "productivity",
    "social",
    "seo",
    "analytics",
    "design",
    "development",
    "finance",
    "education",
    "communication",
    "customer service",
    "project management",
    "human resources",
    "operations",
    "it",
    "legal",
    "real estate",
    "healthcare",
    "retail",
    "hospitality",
    "transportation",
    "construction",
    "manufacturing",
    "media",
    "entertainment",
    "nonprofit",
    "government",
    "other",
    "advertising",
    "agriculture",
    "automotive",
    "biotechnology",
    "clean energy",
    "clean technology",
    "clean",
    "consumer electronics",
    "consumer goods",
    "consumer services",
    "cosmetics",
    "e-commerce",
    "fashion",
    "food and beverage",
    "gaming",
    "hardware",
    "health and wellness",
    "information technology",
    "internet",
    "internet of things",
    "lifestyle",
    "logistics",
    "luxury goods",
    "machine learning",
    "market research",
    "mobile",
    "nanotechnology",
    "networking",
    "oil and gas",
    "online marketplace",
    "pharmaceuticals",
    "professional services",
    "renewables and environment",
    "retail technology",
    "saas",
    "semiconductor",
    "social media",
    "software engineering",
    "sports",
    "technology",
    "telecommunications",
    "transport",
    "travel",
    "utilities",
    "virtual reality",
    "wearables",
    "web development",
    "wellness and fitness",
    "wireless",
    "a",
    "b",
    "c",
    "d",
    "e",
    "f",
    "g",
    "h",
    "i",
    "j",
    "k",
    "l",
    "m",
    "n",
    "o",
    "p",
    "q",
    "r",
    "s",
    "t",
    "u",
    "v",
    "w",
    "x",
    "y",
    "z",
  ];
  for (const query of queries) {
    console.log("query", query);
    const allListings = await getAllChromeExtensionListingsFromQuery(query);
    fs.writeFileSync("test.json", JSON.stringify(allListings, null, 2));
    // Keep the first occurrence of every id; Map preserves insertion order.
    const uniqueById = new Map();
    for (const listing of allListings) {
      if (!uniqueById.has(listing.id)) {
        uniqueById.set(listing.id, listing);
      }
    }
    const unique = [...uniqueById.values()];
    console.log("allListings.length", allListings.length);
    const { error } = await supabase.from("chrome_extensions").upsert(unique);
    if (error) {
      console.log("error inserring", error);
    }
  }
}
/**
 * Crawls every known store category, one after another, and upserts each
 * category's listings into the `chrome_extensions` table.
 *
 * An upsert error is logged and the crawl moves on to the next category.
 *
 * @returns {Promise<void>}
 */
async function getAllChromeExtensionListingsByCategory() {
  const categories = [
    "productivity/communication",
    "productivity/developer",
    "productivity/education",
    "productivity/tools",
    "productivity/workflow",
    "lifestyle/art",
    "lifestyle/entertainment",
    "lifestyle/games",
    "lifestyle/household",
    "lifestyle/fun",
    "lifestyle/news",
    "lifestyle/shopping",
    "lifestyle/social",
    "lifestyle/travel",
    "lifestyle/well_being",
  ];

  for (let position = 0; position < categories.length; position += 1) {
    const category = categories[position];
    console.log("category", category);

    const listings = await getAllChromeExtensionListings(category);
    console.log("allListings.length", listings.length);

    const table = supabase.from("chrome_extensions");
    const outcome = await table.upsert(listings);
    if (outcome.error) {
      console.log("error inserring", outcome.error);
    }
  }
}
/**
 * Looks up a contact email on an extension's page and stores it on the
 * matching `chrome_extensions` row (matched by `url`).
 *
 * Nothing is written when the extension has no `url`, when the page could not
 * be fetched, or when no email was found. Previously those cases issued an
 * update whose only field was `undefined`, and a missing `extension` threw on
 * `extension.url` after the null-safe `extension?.url` had already been used.
 *
 * @param {{url: string}} extension - Row from the `chrome_extensions` table.
 * @returns {Promise<void>}
 */
async function runEmailStuff(extension) {
  const url = extension?.url;
  if (!url) {
    return;
  }
  const emailObj = await getExtensionPage(url);
  console.log("emailObj", emailObj);
  const email = emailObj?.email;
  if (!email) {
    // Nothing to store; skip the pointless round-trip.
    return;
  }
  const { error } = await supabase
    .from("chrome_extensions")
    .update({ email })
    .eq("url", url);
  if (error) {
    console.log("error updating email", error);
  }
}
// Entry point: refresh the listings by category, then look up contact emails
// for every stored extension, ten pages at a time.
(async () => {
  await getAllChromeExtensionListingsByCategory();
  // await getAllChromeExtensionListingsByQuery();

  // get emails
  const allExtensions = await getAllItemsFromSupabaseTable("chrome_extensions");
  const total = allExtensions.length;

  for (let start = 0; start < total; start += 10) {
    // Start up to ten lookups together, then wait for the whole group before
    // starting the next one.
    const pending = allExtensions
      .slice(start, start + 10)
      .map((extension, offset) => {
        console.log(`processing ${start + offset} of ${total}`);
        return runEmailStuff(extension);
      });
    await Promise.all(pending);
  }

  console.log("done");
})();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment