Strip `script` and `style` tags from HTML
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


get_ipython().run_line_magic('pip', 'install -q beautifulsoup4')


# In[ ]:


import os
import glob
import csv
import time
from bs4 import BeautifulSoup


def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_html(html_content):
    """Remove script and style tags from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()

    return str(soup)


def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path


def process_html_files(input_directory, output_directory):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []

    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)

        html_content = read_html_file(file_path)
        cleaned_content = clean_html(html_content)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)

        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()

        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100

        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })

    return results


def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path


# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'

# Process the HTML files
results = process_html_files(input_directory, output_directory)

# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")

# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
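

# In[ ]:


# Quick sanity check of clean_html on a small inline snippet (illustrative only;
# the sample HTML below is made up and is not one of the downloaded pages).
sample_html = """
<html><head><style>body { color: red; }</style></head>
<body><p>Hello</p><script>console.log('hi');</script></body></html>
"""
print(clean_html(sample_html))
# Expected: the <style> and <script> blocks are gone, leaving only the markup and text.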
# In[ ]:


import os
import glob
import csv
import time
import re
from bs4 import BeautifulSoup, Comment
from urllib.parse import urlparse, urljoin


def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_and_optimize_html(html_content, base_url):
    """Clean and optimize HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()

    # Minification: remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Attribute optimization
    for tag in soup.find_all():
        # Remove default attributes
        if tag.name == 'script' and tag.get('type') == 'text/javascript':
            del tag['type']
        if tag.name in ['div', 'span'] and tag.get('style') == '':
            del tag['style']

    # URL shortening
    for tag in soup.find_all(['a', 'img', 'link', 'script']):
        if tag.has_attr('href'):
            tag['href'] = shorten_url(tag['href'], base_url)
        if tag.has_attr('src'):
            tag['src'] = shorten_url(tag['src'], base_url)

    # Entity encoding optimization
    html_string = str(soup)
    html_string = html_string.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
    # Remove empty elements
    html_string = re.sub(r'<([a-z]+)>\s*</\1>', '', html_string, flags=re.IGNORECASE)

    # Final minification: remove unnecessary whitespace
    html_string = re.sub(r'\s+', ' ', html_string)
    html_string = re.sub(r'>\s+<', '><', html_string)

    return html_string


def shorten_url(url, base_url):
    """Convert absolute URLs to relative when possible."""
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == parsed_base.netloc:
        return urljoin('/', parsed_url.path)
    return url


def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path


def process_html_files(input_directory, output_directory, base_url):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []

    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)

        html_content = read_html_file(file_path)
        cleaned_content = clean_and_optimize_html(html_content, base_url)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)

        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()

        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100

        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })

    return results


def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path


# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'
base_url = 'https://www.amazon.com'  # Adjust this to the actual base URL of your Amazon pages

# Process the HTML files
results = process_html_files(input_directory, output_directory, base_url)

# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")

# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
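

# In[ ]:


# Quick sanity check of shorten_url and clean_and_optimize_html (illustrative only;
# the URLs and snippet below are made up, not taken from the actual input files).
print(shorten_url('https://www.amazon.com/dp/B000000000', base_url))                # -> '/dp/B000000000'
print(shorten_url('https://images-na.ssl-images-amazon.com/img.png', base_url))    # unchanged: different host

sample_html = '<div style=""><a href="https://www.amazon.com/gp/help">Help</a><!-- tracking --><p></p></div>'
print(clean_and_optimize_html(sample_html, base_url))
# Expected: the comment, the empty <p>, and the empty style attribute are removed,
# and the link becomes relative ('/gp/help').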