abrichr/get_pdf.py

## get_pdf.py
"""
Module to download a PDF from Sci-Hub using a provided DOI.

This script uses curl to fetch a PDF from Sci-Hub based on the provided DOI and
saves it to a specified output path.

Example usage:
    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"

If no output path is specified, the DOI will be used as the filename with invalid
characters replaced.
"""

import subprocess
import re
import sys

def sanitize_filename(doi: str) -> str:
    """
    Build a PDF filename from a DOI.

    Every backslash, forward slash, ``*``, ``?``, ``:``, double quote,
    ``<``, ``>`` and ``|`` in the DOI is swapped for an underscore, and the
    ``.pdf`` extension is appended to the result.

    Args:
        doi (str): The DOI to turn into a filename.

    Returns:
        str: The DOI with the characters above replaced, ending in ``.pdf``.
    """
    # Characters that are replaced one-for-one with "_".
    forbidden = '\\/*?:"<>|'
    replacements = str.maketrans(forbidden, "_" * len(forbidden))
    safe_name = doi.translate(replacements)
    return f"{safe_name}.pdf"

def get_pdf_by_doi(doi: str, output_path: str) -> None:
    """
    Download a PDF from Sci-Hub using the provided DOI.

    The Sci-Hub page for the DOI is fetched with curl, the first
    ``src="..."`` attribute whose value contains ``.pdf`` is extracted from
    the returned HTML, and that URL is downloaded to ``output_path`` with a
    second curl invocation. Requires the ``curl`` executable to be on PATH.

    Args:
        doi (str): The DOI of the paper to download.
        output_path (str): The path to save the downloaded PDF.

    Raises:
        RuntimeError: If the Sci-Hub page cannot be fetched, no PDF URL is
            found in it, or the PDF download fails.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urljoin

    base_url = "https://sci-hub.se/"

    # Construct the Sci-Hub URL
    sci_hub_url = f"{base_url}{doi}"

    # -s hides the progress meter, -S still reports errors on stderr (so the
    # message below is not empty), -L follows redirects. The page is decoded
    # as UTF-8 with replacement so a stray byte cannot abort the run.
    result = subprocess.run(
        ['curl', '-sSL', sci_hub_url],
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )

    if result.returncode != 0:
        raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")

    # Extract the PDF URL from the Sci-Hub page
    pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)

    if not pdf_url_match:
        raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

    # urljoin handles every form the attribute can take: absolute URLs are
    # kept, protocol-relative ("//host/x.pdf") URLs inherit https, and
    # site-relative paths are resolved against the Sci-Hub host.
    pdf_url = urljoin(base_url, pdf_url_match.group(1))

    # -f makes curl exit non-zero on HTTP errors instead of saving the error
    # page as the "PDF"; stderr is captured so it can be reported.
    download_result = subprocess.run(
        ['curl', '-fsSL', '-o', output_path, pdf_url],
        capture_output=True,
        text=True,
        errors="replace",
    )

    if download_result.returncode != 0:
        raise RuntimeError(f"Failed to download the PDF: {download_result.stderr}")

    print(f"Saved to {output_path}")

def main() -> None:
    """
    Command-line entry point: download the PDF for a DOI via Sci-Hub.

    Command-line Args:
        doi (str): The DOI of the paper to download.
        output_path (str, optional): Where to save the PDF. When omitted,
            the filename is derived from the DOI with ``sanitize_filename``.

    Prints a usage line and exits with status 1 when no DOI is given.
    """
    cli_args = sys.argv[1:]

    # Guard clause: a DOI is mandatory.
    if not cli_args:
        print("Usage: python get_pdf.py <DOI> [output_path]")
        sys.exit(1)

    doi = cli_args[0]
    if len(cli_args) > 1:
        output_path = cli_args[1]
    else:
        output_path = sanitize_filename(doi)

    get_pdf_by_doi(doi, output_path)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
	"""
	Module to download a PDF from Sci-Hub using a provided DOI.

	This script uses curl to fetch a PDF from Sci-Hub based on the provided DOI and
	saves it to a specified output path.

	Example usage:
	    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"

	If no output path is specified, the DOI will be used as the filename with invalid
	characters replaced.
	"""

	import subprocess
	import re
	import sys

	def sanitize_filename(doi: str) -> str:
		"""
		Sanitize the DOI to create a valid filename.

		Every backslash, forward slash, ``*``, ``?``, ``:``, double quote,
		``<``, ``>`` and ``|`` is replaced with an underscore, and ``.pdf``
		is appended.

		Args:
			doi (str): The DOI to sanitize.

		Returns:
			str: A sanitized filename ending in ``.pdf``.
		"""
		return re.sub(r'[\\/*?:"<>\|]', '_', doi) + ".pdf"

	def get_pdf_by_doi(doi: str, output_path: str) -> None:
		"""
		Download a PDF from Sci-Hub using the provided DOI.

		The Sci-Hub page for the DOI is fetched with curl, the first
		``src="..."`` attribute whose value contains ``.pdf`` is extracted
		from the returned HTML, and that URL is downloaded to
		``output_path`` with a second curl invocation. Requires the ``curl``
		executable to be on PATH.

		Args:
			doi (str): The DOI of the paper to download.
			output_path (str): The path to save the downloaded PDF.

		Raises:
			RuntimeError: If the Sci-Hub page cannot be fetched, no PDF URL
				is found in it, or the PDF download fails.
		"""
		# Local import keeps this function self-contained.
		from urllib.parse import urljoin

		base_url = "https://sci-hub.se/"

		# Construct the Sci-Hub URL
		sci_hub_url = f"{base_url}{doi}"

		# -s hides the progress meter, -S still reports errors on stderr (so
		# the message below is not empty), -L follows redirects. The page is
		# decoded as UTF-8 with replacement so a stray byte cannot abort.
		result = subprocess.run(
			['curl', '-sSL', sci_hub_url],
			capture_output=True,
			text=True,
			encoding="utf-8",
			errors="replace",
		)

		if result.returncode != 0:
			raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")

		# Extract the PDF URL from the Sci-Hub page
		pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)

		if not pdf_url_match:
			raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

		# urljoin handles every form the attribute can take: absolute URLs
		# are kept, protocol-relative ("//host/x.pdf") URLs inherit https,
		# and site-relative paths are resolved against the Sci-Hub host.
		pdf_url = urljoin(base_url, pdf_url_match.group(1))

		# -f makes curl exit non-zero on HTTP errors instead of saving the
		# error page as the "PDF"; stderr is captured so it can be reported.
		download_result = subprocess.run(
			['curl', '-fsSL', '-o', output_path, pdf_url],
			capture_output=True,
			text=True,
			errors="replace",
		)

		if download_result.returncode != 0:
			raise RuntimeError(f"Failed to download the PDF: {download_result.stderr}")

		print(f"Saved to {output_path}")

	def main() -> None:
		"""
		Main function to download a PDF by DOI using Sci-Hub.

		Command-line Args:
			doi (str): The DOI of the paper to download.
			output_path (str, optional): The path to save the downloaded
				PDF. Defaults to a filename derived from the DOI with
				``sanitize_filename``.

		Prints a usage line and exits with status 1 when no DOI is given.
		"""
		if len(sys.argv) < 2:
			print("Usage: python get_pdf.py <DOI> [output_path]")
			sys.exit(1)

		doi = sys.argv[1]
		output_path = sys.argv[2] if len(sys.argv) > 2 else sanitize_filename(doi)

		get_pdf_by_doi(doi, output_path)

	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == "__main__":
		main()