Skip to content

Instantly share code, notes, and snippets.

@sgraaf
Created September 15, 2022 12:54
Show Gist options
  • Save sgraaf/9df4a2b2aa3fb7c3ee67626f07d12359 to your computer and use it in GitHub Desktop.
Save sgraaf/9df4a2b2aa3fb7c3ee67626f07d12359 to your computer and use it in GitHub Desktop.
Scrape recent Chrome and Firefox version numbers
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Get Chrome versions (Stable, Desktop)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"\n",
"from requests import Session\n",
"from lxml import html"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preamble"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Strips markup tags and character entities left over in scraped post text.\n",
"HTML_PATTERN = re.compile(r\"<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});\")\n",
"# Chrome versions are four dot-separated numbers (MAJOR.MINOR.BUILD.PATCH).\n",
"# Requiring all four parts avoids matching a bare \"Chrome 105\", a year, or a\n",
"# run of dots that happens to appear earlier in the post body.\n",
"VERSION_PATTERN = re.compile(r\"(\\d+(?:\\.\\d+){3})\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"session = Session()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"next_url = \"https://chromereleases.googleblog.com/search/label/Desktop%20Update\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Main"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the versions recorded by earlier runs. On the very first run the file\n",
"# does not exist yet (it is only created by the append step at the end of the\n",
"# notebook), so start from an empty list instead of failing.\n",
"try:\n",
"    with open(\"chrome_versions.txt\", \"r\", encoding=\"utf-8\") as fh:\n",
"        # Drop blank lines and stray whitespace so membership checks stay exact.\n",
"        versions = [line.strip() for line in fh if line.strip()]\n",
"except FileNotFoundError:\n",
"    versions = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show a Windows desktop Chrome User-Agent string for each stored version.\n",
"for chrome_version in versions:\n",
"    user_agent = f\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/{chrome_version} Safari/537.36\"\n",
"    print(user_agent)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(versions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"len(versions)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_versions = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Walk the blog's result pages, following the \"older posts\" link, and collect\n",
"# versions that are not yet in `versions`. Stops at the first already-known\n",
"# version or when there is no older page left.\n",
"while next_url:\n",
"    response = session.get(next_url, timeout=30)\n",
"    # Fail loudly on HTTP errors instead of parsing an error page as if it had no posts.\n",
"    response.raise_for_status()\n",
"    tree = html.fromstring(response.content)\n",
"\n",
"    # Only remember the older-page link here; the posts on this page must still\n",
"    # be processed even when it is the last page and has no such link.\n",
"    older_links = tree.xpath('//a[@id=\"Blog1_blog-pager-older-link\"]')\n",
"    next_url = older_links[-1].get(\"href\") if older_links else None\n",
"\n",
"    reached_known_version = False\n",
"    for blog_post in tree.xpath('//div[@id=\"Blog1\"]/div[@class=\"post\"]'):\n",
"        heading = blog_post.find(\"h2\")\n",
"        if heading is None or heading.text_content().strip() != \"Stable Channel Update for Desktop\":\n",
"            continue\n",
"\n",
"        post_body = blog_post.find('div[@class=\"post-body\"]')\n",
"        if post_body is None:\n",
"            continue\n",
"\n",
"        body = re.sub(HTML_PATTERN, \"\", post_body.text_content()).strip()\n",
"        match = re.search(VERSION_PATTERN, body)\n",
"        if match is None:\n",
"            # No version number in this post; skip it rather than crash.\n",
"            continue\n",
"\n",
"        version = match.group()\n",
"        if version in versions:\n",
"            # Older posts are assumed to have been recorded by an earlier run.\n",
"            reached_known_version = True\n",
"            break\n",
"        if version not in new_versions:\n",
"            # Guard against duplicates, e.g. when this cell is re-run.\n",
"            new_versions.append(version)\n",
"\n",
"    if reached_known_version:\n",
"        break"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"new_versions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"if new_versions:\n",
"    # Order numerically, component by component; plain string sorting would\n",
"    # put \"100.0.4896.60\" before \"99.0.4844.51\". Empty components (from a\n",
"    # stray leading/trailing dot) are ignored so int() never sees \"\".\n",
"    ordered_versions = sorted(\n",
"        new_versions,\n",
"        key=lambda v: tuple(int(part) for part in v.split(\".\") if part),\n",
"    )\n",
"    with open(\"chrome_versions.txt\", \"a\", encoding=\"utf-8\") as fh:\n",
"        for version in ordered_versions:\n",
"            fh.write(version + \"\\n\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.13 64-bit",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "ee427b6b47775788dd76e73cf8e9aba462fa2dd30b2c8940bf43e4b56c5081dc"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment