Strip `JS` and `style` tags from HTML
#!/usr/bin/env python
# coding: utf-8
# In[ ]:
get_ipython().run_line_magic('pip', 'install -q beautifulsoup4')
# In[ ]:
import os
import glob
import csv
import time
from bs4 import BeautifulSoup

def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def clean_html(html_content):
    """Remove script and style tags from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')
    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()
    return str(soup)

def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path

def process_html_files(input_directory, output_directory):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []
    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)
        html_content = read_html_file(file_path)
        cleaned_content = clean_html(html_content)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)
        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()
        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100
        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })
    return results

def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path

# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'
# Process the HTML files
results = process_html_files(input_directory, output_directory)
# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")
# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
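
# In[ ]:

# Quick sanity check (a minimal sketch added here, not part of the original gist):
# run clean_html on a small inline snippet to confirm that <script> and <style>
# blocks are dropped while the visible markup is preserved.
sample_html = (
    "<html><head><style>body { color: red; }</style></head>"
    "<body><p>Hello</p><script>console.log('hi');</script></body></html>"
)
cleaned_sample = clean_html(sample_html)
assert "<script>" not in cleaned_sample and "<style>" not in cleaned_sample
print(cleaned_sample)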
# In[ ]:
import os
import glob
import csv
import time
import re
from bs4 import BeautifulSoup, Comment
from urllib.parse import urlparse, urljoin

def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def clean_and_optimize_html(html_content, base_url):
    """Clean and optimize HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()

    # Minification: remove comments
    for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Attribute optimization
    for tag in soup.find_all():
        # Remove default attributes
        if tag.name == 'script' and tag.get('type') == 'text/javascript':
            del tag['type']
        if tag.name in ['div', 'span'] and tag.get('style') == '':
            del tag['style']

    # URL shortening
    for tag in soup.find_all(['a', 'img', 'link', 'script']):
        if tag.has_attr('href'):
            tag['href'] = shorten_url(tag['href'], base_url)
        if tag.has_attr('src'):
            tag['src'] = shorten_url(tag['src'], base_url)

    # Entity encoding optimization
    html_string = str(soup)
    html_string = html_string.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')

    # Remove empty elements
    html_string = re.sub(r'<([a-z]+)>\s*</\1>', '', html_string, flags=re.IGNORECASE)

    # Final minification: remove unnecessary whitespace
    html_string = re.sub(r'\s+', ' ', html_string)
    html_string = re.sub(r'>\s+<', '><', html_string)

    return html_string

def shorten_url(url, base_url):
    """Convert absolute URLs to relative when possible."""
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == parsed_base.netloc:
        return urljoin('/', parsed_url.path)
    return url

def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path

def process_html_files(input_directory, output_directory, base_url):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []
    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)
        html_content = read_html_file(file_path)
        cleaned_content = clean_and_optimize_html(html_content, base_url)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)
        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()
        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100
        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })
    return results

def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path

# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'
base_url = 'https://www.amazon.com' # Adjust this to the actual base URL of your Amazon pages
# Process the HTML files
results = process_html_files(input_directory, output_directory, base_url)
# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")
# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
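
# In[ ]:

# Quick sanity check (a minimal sketch added here, not part of the original gist):
# shorten_url should rewrite same-host absolute URLs to root-relative paths and leave
# other hosts untouched; clean_and_optimize_html should also drop HTML comments.
# The product path and sample markup below are made up purely for illustration.
assert shorten_url('https://www.amazon.com/dp/EXAMPLE', base_url) == '/dp/EXAMPLE'
assert shorten_url('https://example.com/page', base_url) == 'https://example.com/page'
sample = '<div><!-- tracking --><a href="https://www.amazon.com/dp/EXAMPLE">item</a></div>'
print(clean_and_optimize_html(sample, base_url))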