Last active
August 12, 2024 17:22
-
-
Save abrichr/455f0e569bf1bd104c696a7ad9e6b20f to your computer and use it in GitHub Desktop.
Get a PDF by its DOI
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Module to download a PDF from Sci-Hub using a provided DOI. | |
This script uses curl to fetch a PDF from Sci-Hub based on the provided DOI and | |
saves it to a specified output path. | |
Example usage: | |
python get_pdf.py "10.1038/s41586-020-2649-2" "output.pdf" | |
If no output path is specified, the DOI will be used as the filename with invalid | |
characters replaced. | |
""" | |
import subprocess | |
import re | |
import sys | |
def sanitize_filename(doi: str) -> str:
    """
    Build a PDF filename from a DOI.

    Every occurrence of one of the characters ``\\ / * ? : " < > |`` is
    replaced with an underscore, and ``.pdf`` is appended to the result.

    Args:
        doi (str): The DOI to convert.

    Returns:
        str: The DOI with the characters above replaced, plus ``.pdf``.
    """
    unsafe_chars = re.compile(r'[\\/*?:"<>|]')
    stem = unsafe_chars.sub('_', doi)
    return stem + ".pdf"
def get_pdf_by_doi(doi: str, output_path: str) -> None:
    """
    Download a PDF from Sci-Hub using the provided DOI.

    The Sci-Hub page for the DOI is fetched with curl, the first
    ``src="..."`` attribute whose value contains ``.pdf`` is taken as the
    PDF location, and that URL is downloaded to ``output_path`` with a
    second curl invocation.

    Args:
        doi (str): The DOI of the paper to download.
        output_path (str): The path to save the downloaded PDF.

    Raises:
        RuntimeError: If the Sci-Hub page cannot be fetched, no PDF URL is
            found on it, or the PDF download fails.
    """
    # Imported here so the function does not depend on a module-level import.
    from urllib.parse import urljoin

    # Construct the Sci-Hub URL
    sci_hub_url = f"https://sci-hub.se/{doi}"

    # Use curl to fetch the Sci-Hub page. -S makes curl still report errors on
    # stderr while -s hides the progress meter, so the message below is not
    # empty when the fetch fails.
    result = subprocess.run(
        ['curl', '-sSL', sci_hub_url], capture_output=True, text=True
    )
    if result.returncode != 0:
        raise RuntimeError(f"Failed to fetch the Sci-Hub page: {result.stderr}")

    # Extract the PDF URL from the Sci-Hub page
    pdf_url_match = re.search(r'src="([^"]+\.pdf[^"]*)"', result.stdout)
    if not pdf_url_match:
        raise RuntimeError("Failed to find the PDF URL on the Sci-Hub page.")

    # Resolve the reference against the page URL. urljoin leaves absolute URLs
    # untouched and also handles protocol-relative ("//host/x.pdf") and
    # relative references, which plain string concatenation would turn into
    # a malformed URL.
    pdf_url = urljoin(sci_hub_url, pdf_url_match.group(1))

    # Download the PDF using curl. -f turns HTTP error responses into a
    # non-zero exit code instead of saving the error page as the PDF, and -L
    # follows redirects. Output is not captured, so curl's progress meter and
    # error text go straight to the terminal.
    download_result = subprocess.run(['curl', '-fL', '-o', output_path, pdf_url])
    if download_result.returncode != 0:
        # stderr is not captured for this call, so report the exit status.
        raise RuntimeError(
            "Failed to download the PDF: "
            f"curl exited with status {download_result.returncode}"
        )

    print(f"Saved to {output_path}")
def main() -> None:
    """
    Command-line entry point: download a PDF by DOI using Sci-Hub.

    Command-line Args:
        doi (str): The DOI of the paper to download.
        output_path (str, optional): The path to save the downloaded PDF.
            When omitted, a filename derived from the DOI is used.

    Prints a usage line and exits with status 1 when no DOI is given.
    """
    cli_args = sys.argv[1:]

    # Guard clause: a DOI is mandatory.
    if not cli_args:
        print("Usage: python get_pdf.py <DOI> [output_path]")
        sys.exit(1)

    doi = cli_args[0]
    if len(cli_args) > 1:
        output_path = cli_args[1]
    else:
        output_path = sanitize_filename(doi)

    get_pdf_by_doi(doi, output_path)
# Run the downloader only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment