Last active
August 25, 2020 16:14
-
-
Save abaykan/8196ae250afb7b141f56aa81274fcfbf to your computer and use it in GitHub Desktop.
Web Scraping with BeautifulSoup
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# Codelatte Team | |
# https://codelatte.org/ | |
# Usage: python3 scrapurl.py https://yoursite.com/ | |
import sys | |
import requests | |
from urllib.request import urlparse, urljoin | |
from bs4 import BeautifulSoup | |
import colorama | |
# Initialise colorama so ANSI color codes also work on Windows consoles.
colorama.init()

# Console color codes used when printing discovered links.
GREEN = colorama.Fore.GREEN
RESET = colorama.Fore.RESET

# Module-level registries of every link discovered so far, shared by
# get_all_website_links() and crawl().
internal_urls = set()
external_urls = set()
def is_valid(url):
    """Return True when *url* is an absolute URL (has a scheme and a host)."""
    parts = urlparse(url)
    # Both components must be non-empty; all() yields a plain bool,
    # matching the original `bool(...) and bool(...)` result.
    return all((parts.scheme, parts.netloc))
def get_all_website_links(url):
    """Fetch *url* and return the set of newly discovered internal links.

    Side effects: prints each new link to stdout and records it in the
    module-level `internal_urls` / `external_urls` sets.  External links
    (a different host) are collected but not returned.  On download
    failure the page is skipped and an empty set is returned.
    """
    urls = set()
    domain_name = urlparse(url).netloc
    try:
        # Timeout keeps the crawl from hanging forever on a dead host.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort crawl: report and skip unreachable pages instead
        # of crashing the whole run.
        print(f"{RESET}[!] Failed to fetch {url}: {exc}")
        return urls
    soup = BeautifulSoup(response.content, "html.parser")
    for a_tag in soup.find_all("a"):  # find_all: findAll is a legacy alias
        href = a_tag.attrs.get("href")
        if not href:
            continue
        # Resolve relative links against the current page.
        href = urljoin(url, href)
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        # Compare hostnames exactly.  The original substring test
        # (`domain_name not in href`) misclassified external URLs that
        # merely contain the domain, e.g. https://evil.com/?ref=example.com
        if urlparse(href).netloc != domain_name:
            if href not in external_urls:
                print(f"{RESET}[!] External link: {href}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls
# Pages fetched so far, shared across all crawl() calls.
total_urls_visited = 0


def crawl(url, max_urls=50):
    """Crawl the site starting at *url*, fetching at most *max_urls* pages.

    Args:
        url: absolute URL to start crawling from.
        max_urls: hard cap on the total number of pages fetched.

    Uses an explicit worklist instead of recursion: the original
    recursive version raised RecursionError on sites with link chains
    deeper than the interpreter's recursion limit (~1000).
    """
    global total_urls_visited
    to_visit = [url]
    while to_visit and total_urls_visited < max_urls:
        page = to_visit.pop()
        total_urls_visited += 1
        # Only links not yet seen are returned, so each page is queued once.
        to_visit.extend(get_all_website_links(page))
if __name__ == "__main__":
    # Guard against a missing argument: the original crashed with
    # IndexError when run without a start URL.
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <url>")
        sys.exit(1)
    crawl(sys.argv[1])
    # Summary of everything discovered during the crawl.
    print("[+] Total External links:", len(external_urls))
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total:", len(external_urls) + len(internal_urls))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment