Web Scraping with BeautifulSoup
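A small recursive crawler: it fetches a page, classifies every <a href> it finds as internal or external to the start domain, and follows internal links until a page budget is reached. It depends on the third-party packages requests, beautifulsoup4, and colorama (installable with pip).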
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Codelatte Team
# https://codelatte.org/
# Usage: python3 scrapurl.py https://yoursite.com/
import sys

import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
import colorama

colorama.init()
GREEN = colorama.Fore.GREEN
RESET = colorama.Fore.RESET

# Links discovered so far, split by whether they belong to the target domain.
internal_urls = set()
external_urls = set()

def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

def get_all_website_links(url):
    # Collect every link found on `url`, recording internal and external ones.
    urls = set()
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            continue
        # Resolve relative hrefs against the page URL.
        href = urljoin(url, href)
        if not is_valid(href):
            continue
        if href in internal_urls:
            continue
        if domain_name not in href:
            # The link points outside the target domain.
            if href not in external_urls:
                print(f"{RESET}[!] External link: {href}")
                external_urls.add(href)
            continue
        print(f"{GREEN}[*] Internal link: {href}{RESET}")
        urls.add(href)
        internal_urls.add(href)
    return urls

total_urls_visited = 0

def crawl(url, max_urls=50):
    # Recursively follow internal links until max_urls pages have been visited.
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

if __name__ == "__main__":
    crawl(sys.argv[1])
    print("[+] Total External links:", len(external_urls))
    print("[+] Total Internal links:", len(internal_urls))
    print("[+] Total:", len(external_urls) + len(internal_urls))