Strip `script` and `style` tags from HTML
#!/usr/bin/env python
# coding: utf-8

# In[ ]:


get_ipython().run_line_magic('pip', 'install -q beautifulsoup4')


# In[ ]:


import os
import glob
import csv
import time
from bs4 import BeautifulSoup


def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_html(html_content):
    """Remove script and style tags from HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()

    return str(soup)


def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path


def process_html_files(input_directory, output_directory):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []

    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)

        html_content = read_html_file(file_path)
        cleaned_content = clean_html(html_content)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)

        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()

        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100

        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })

    return results


def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path


# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'

# Process the HTML files
results = process_html_files(input_directory, output_directory)

# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")

# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
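

# In[ ]:


# Quick sanity check of clean_html on a small inline snippet (illustrative only;
# the sample HTML below is made up and is not one of the downloaded pages).
sample_html = """
<html><head><style>body { color: red; }</style></head>
<body><p>Hello</p><script>console.log('hi');</script></body></html>
"""
print(clean_html(sample_html))
# Expected: the <style> and <script> blocks are gone, leaving only the markup and text.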
# In[ ]:


import os
import glob
import csv
import time
import re
from bs4 import BeautifulSoup, Comment
from urllib.parse import urlparse, urljoin


def read_html_file(file_path):
    """Read the contents of an HTML file."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()


def clean_and_optimize_html(html_content, base_url):
    """Clean and optimize HTML content."""
    soup = BeautifulSoup(html_content, 'html.parser')

    # Remove script and style tags
    for script in soup(["script", "style"]):
        script.decompose()

    # Minification: remove comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Attribute optimization
    for tag in soup.find_all():
        # Remove default attributes
        if tag.name == 'script' and tag.get('type') == 'text/javascript':
            del tag['type']
        if tag.name in ['div', 'span'] and tag.get('style') == '':
            del tag['style']

    # URL shortening
    for tag in soup.find_all(['a', 'img', 'link', 'script']):
        if tag.has_attr('href'):
            tag['href'] = shorten_url(tag['href'], base_url)
        if tag.has_attr('src'):
            tag['src'] = shorten_url(tag['src'], base_url)

    # Entity encoding optimization
    html_string = str(soup)
    html_string = html_string.replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')
    # Remove empty elements
    html_string = re.sub(r'<([a-z]+)>\s*</\1>', '', html_string, flags=re.IGNORECASE)

    # Final minification: remove unnecessary whitespace
    html_string = re.sub(r'\s+', ' ', html_string)
    html_string = re.sub(r'>\s+<', '><', html_string)

    return html_string


def shorten_url(url, base_url):
    """Convert absolute URLs to relative when possible."""
    parsed_url = urlparse(url)
    parsed_base = urlparse(base_url)
    if parsed_url.netloc == parsed_base.netloc:
        return urljoin('/', parsed_url.path)
    return url


def write_cleaned_html(input_file_path, output_directory, cleaned_content):
    """Write the cleaned HTML content to a new file in the output directory."""
    base_name = os.path.basename(input_file_path)
    cleaned_file_path = os.path.join(output_directory, base_name)
    with open(cleaned_file_path, 'w', encoding='utf-8') as file:
        file.write(cleaned_content)
    return cleaned_file_path


def process_html_files(input_directory, output_directory, base_url):
    """Process all HTML files in the input directory and save to the output directory."""
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    html_files = glob.glob(os.path.join(input_directory, "*.html"))
    results = []

    for file_path in html_files:
        start_time = time.time()
        original_size = os.path.getsize(file_path)

        html_content = read_html_file(file_path)
        cleaned_content = clean_and_optimize_html(html_content, base_url)
        cleaned_file_path = write_cleaned_html(file_path, output_directory, cleaned_content)

        cleaned_size = os.path.getsize(cleaned_file_path)
        end_time = time.time()

        reduction_percentage = ((original_size - cleaned_size) / original_size) * 100

        results.append({
            'file_name': os.path.basename(file_path),
            'input_size': original_size / (1024 * 1024),  # Convert to MB
            'output_size': cleaned_size / (1024 * 1024),  # Convert to MB
            'processing_time': (end_time - start_time) * 1000,  # Convert to ms
            'reduction_percentage': reduction_percentage
        })

    return results


def create_csv_report(results, output_directory):
    """Create a CSV report with the processing results."""
    report_path = os.path.join(output_directory, 'processing_report.csv')
    with open(report_path, 'w', newline='') as csvfile:
        fieldnames = ['file_name', 'input_size', 'output_size', 'processing_time', 'reduction_percentage']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for result in results:
            writer.writerow({
                'file_name': result['file_name'],
                'input_size': f"{result['input_size']:.2f}",
                'output_size': f"{result['output_size']:.2f}",
                'processing_time': f"{result['processing_time']:.2f}",
                'reduction_percentage': f"{result['reduction_percentage']:.2f}"
            })
    return report_path


# Usage
input_directory = '/Users/harit/Downloads/working/amz/input'
output_directory = '/Users/harit/Downloads/working/amz/output'
base_url = 'https://www.amazon.com'  # Adjust this to the actual base URL of your Amazon pages

# Process the HTML files
results = process_html_files(input_directory, output_directory, base_url)

# Create CSV report
report_path = create_csv_report(results, output_directory)
print(f"CSV report created: {report_path}")

# Print CSV content
with open(report_path, 'r') as csvfile:
    print(csvfile.read())
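

# In[ ]:


# Quick sanity check of shorten_url and clean_and_optimize_html (illustrative only;
# the URLs and snippet below are made up, not taken from the actual input files).
print(shorten_url('https://www.amazon.com/dp/B000000000', base_url))                # -> '/dp/B000000000'
print(shorten_url('https://images-na.ssl-images-amazon.com/img.png', base_url))    # unchanged: different host

sample_html = '<div style=""><a href="https://www.amazon.com/gp/help">Help</a><!-- tracking --><p></p></div>'
print(clean_and_optimize_html(sample_html, base_url))
# Expected: the comment, the empty <p>, and the empty style attribute are removed,
# and the link becomes relative ('/gp/help').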