Skip to content

Instantly share code, notes, and snippets.

@abrichr
Last active August 12, 2024 17:22
Show Gist options
  • Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.
Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.
Get a PDF by its DOI
"""
Module to download a PDF from Sci-Hub using a provided DOI.

This script uses curl to fetch the Sci-Hub page for the provided DOI, locates
the PDF link on that page, and saves the PDF to the specified output path.

Example usage:
    python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf"

If no output path is specified, the DOI is used as the filename, with invalid
characters replaced by underscores and a ".pdf" extension appended.
"""
import re
import subprocess
import sys
from urllib.parse import urljoin, urlsplit
def sanitize_filename(doi: str) -> str:
    """
    Sanitize the DOI to create a valid filename.

    Every character that is unsafe in a filename on common platforms
    (backslash, slash, ``*``, ``?``, ``:``, double quote, ``<``, ``>``,
    ``|``) and every ASCII control character (for example a newline or NUL
    picked up when the DOI was pasted) is replaced with an underscore, and a
    ``.pdf`` extension is appended.

    Args:
        doi (str): The DOI to sanitize.

    Returns:
        str: A sanitized filename ending in ``.pdf``.
    """
    # \x00-\x1f covers control characters: NUL is rejected by every OS and the
    # rest are invalid on Windows and awkward everywhere else.
    return re.sub(r'[\\/*?:"<>|\x00-\x1f]', '_', doi) + ".pdf"
def get_pdf_by_doi(doi: str, output_path: str) -> None:
    """
    Download a PDF from Sci-Hub using the provided DOI.

    The Sci-Hub page for the DOI is fetched with curl, the first
    ``src="..."`` attribute whose value contains ``.pdf`` is extracted from
    the returned HTML, and that URL is downloaded with curl to
    ``output_path``.

    Args:
        doi (str): The DOI of the paper to download.
        output_path (str): The path to save the downloaded PDF.

    Raises:
        RuntimeError: If the Sci-Hub page cannot be fetched, no PDF URL is
            found on it, the PDF URL is not an http(s) URL, or the download
            fails.
    """
    site_root = "https://sci-hub.se/"

    # Construct the Sci-Hub URL
    sci_hub_url = f"https://sci-hub.se/{doi}"

    # Use curl to fetch the Sci-Hub page.
    #   --globoff: DOIs may contain [] or {}, which curl would otherwise try
    #              to expand as URL glob patterns.
    #   -S:        -s alone also silences error text; -S keeps it so that
    #              result.stderr is meaningful in the exception below.
    result = subprocess.run(
        ['curl', '-sSL', '--globoff', sci_hub_url],
        capture_output=True,
        text=True,
    )
    if result.returncode != 0:
        raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")

    # Extract the PDF URL from the Sci-Hub page
    pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)
    if not pdf_url_match:
        raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

    # Ensure the PDF URL is complete. urljoin handles all three forms the page
    # may use: absolute ("https://host/x.pdf"), root-relative ("/x.pdf") and
    # protocol-relative ("//host/x.pdf"). Plain string prefixing turned the
    # last form into "https://sci-hub.se//host/x.pdf".
    pdf_url = urljoin(site_root, pdf_url_match.group(1))

    # The URL comes from remote HTML; never hand curl a file:// (or other
    # non-web) URL taken from it.
    if urlsplit(pdf_url).scheme not in ("http", "https"):
        raise RuntimeError(f"Refusing to download from non-HTTP URL: {pdf_url}")

    # Download the PDF using curl.
    #   -f: treat HTTP errors (404, 5xx) as failures instead of saving the
    #       error page as if it were the PDF.
    #   -L: follow redirects to the actual file.
    # stderr is captured so the exception message carries curl's error text.
    download_result = subprocess.run(
        ['curl', '-fsSL', '--globoff', '-o', output_path, pdf_url],
        capture_output=True,
        text=True,
    )
    if download_result.returncode != 0:
        raise RuntimeError(f"Failed to download the PDF: {download_result.stderr}")

    print(f"Saved to {output_path}")
def main() -> None:
    """
    Command-line entry point: download the PDF for a DOI via Sci-Hub.

    Command-line Args:
        doi (str): The DOI of the paper to download.
        output_path (str, optional): Where to write the PDF. When omitted,
            a filename derived from the DOI is used.
    """
    cli_args = sys.argv[1:]

    # Without at least a DOI there is nothing to do: show usage and bail out.
    if not cli_args:
        print("Usage: python get_pdf.py <DOI> [output_path]")
        sys.exit(1)

    doi, *extra = cli_args
    if extra:
        output_path = extra[0]
    else:
        output_path = sanitize_filename(doi)

    get_pdf_by_doi(doi, output_path)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment