Created
May 14, 2021 15:31
-
-
Save neubig/246f9021049b23910dbe4e01ba5da86b to your computer and use it in GitHub Desktop.
Convert OpenReview IDs to Semantic Scholar Papers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import openreview | |
import argparse | |
import requests | |
import time | |
import sys | |
import csv | |
import json | |
from tqdm import tqdm # Progress bar | |
# Utility script: for each OpenReview profile ID in a reviewer list, find the
# linked Semantic Scholar author and write one CSV row per paper
# (OpenReview ID, S2 paper ID, title, abstract).
parser = argparse.ArgumentParser(
    description='Export a CSV of Semantic Scholar papers for a list of OpenReview IDs',
)
parser.add_argument('--username', help='OpenReview username', required=True)
parser.add_argument('--password', help='OpenReview password', required=True)
parser.add_argument('--csv_input', help='A seed csv file, useful if you want to limit queries to s2', default=None)
parser.add_argument('--csv_output', help='Where to output the csv file', default='s2_expertise.csv')
parser.add_argument('--reviewer_list', help='A list of reviewers, one OpenReview ID per line', default='tilde_members.txt')
parser.add_argument('--baseurl', help='url for openreview', default='https://api.openreview.net')
args = parser.parse_args()

# Authenticated OpenReview client shared by all profile lookups below.
or_client = openreview.Client(baseurl=args.baseurl, username=args.username, password=args.password)
def orid_to_s2id(orid):
    """Return the Semantic Scholar author ID linked from an OpenReview profile.

    The profile is fetched through the module-level ``or_client`` and its
    ``semanticScholar`` content field (a URL) is read; the ID is the last path
    segment of that URL, converted to ``int``.

    Args:
        orid: OpenReview profile ID to look up.

    Returns:
        The author ID as an ``int``, or ``None`` when the profile has no
        ``semanticScholar`` entry or when anything goes wrong (lookup failure,
        URL not ending in a number). Failures are reported on stderr.
    """
    # Deliberately broad: this is a best-effort lookup and a single bad
    # profile must not abort the whole export.
    try:
        profile = or_client.get_profile(orid)
        s2url = profile.content.get('semanticScholar', None)
        if not s2url:
            return None
        # Drop any query string / fragment and trailing slashes first, so URLs
        # like '.../author/Name/123/' or '.../123?sort=x' still yield 123
        # instead of failing int() and silently skipping the reviewer.
        path = s2url.split('?', 1)[0].split('#', 1)[0].rstrip('/')
        return int(path.rsplit('/', 1)[-1])
    except Exception as e:
        print(f'Error getting OR profile for {orid}: {e}', file=sys.stderr)
        return None
# Delay (in milliseconds) inserted before every request. It is doubled each
# time the server answers 429 and is kept across calls.
sleep_time = 1


def query_api(url, session, max_retries=10, timeout=30, max_sleep_ms=60_000):
    """GET ``url`` and return the decoded JSON body, or ``None`` on failure.

    Sleeps ``sleep_time`` milliseconds before each request. On HTTP 429 the
    module-level ``sleep_time`` is doubled (up to ``max_sleep_ms``) and the
    request is retried, at most ``max_retries`` times.

    Args:
        url: URL to fetch.
        session: Object with a ``get(url, timeout=...)`` method returning a
            response that has ``status_code`` and ``json()``.
        max_retries: Maximum number of retries after a 429 response.
        timeout: Per-request timeout in seconds, passed to ``session.get``.
        max_sleep_ms: Upper bound for the adaptive delay, in milliseconds.

    Returns:
        The parsed JSON payload, or ``None`` if the request failed, the retry
        budget was exhausted, the final status was not 200, or the body was
        not valid JSON. Every failure is reported on stderr.
    """
    global sleep_time
    retries_left = max_retries
    while True:
        time.sleep(sleep_time / 1000.0)
        try:
            r = session.get(url, timeout=timeout)
        except OSError as e:
            # requests' RequestException derives from IOError/OSError, so this
            # covers connection errors and timeouts without aborting the run.
            print(f'WARNING: Could not access url {url}: {e}', file=sys.stderr)
            return None
        if r.status_code != 429:
            break
        if retries_left <= 0:
            print(
                f'WARNING: Still rate limited after {max_retries} retries, giving up on {url}',
                file=sys.stderr,
            )
            return None
        retries_left -= 1
        sleep_time = min(sleep_time * 2, max_sleep_ms)
        print(
            f'WARNING: Hit rate limit. Increasing sleep to {sleep_time} ms',
            file=sys.stderr,
        )
    if r.status_code != 200:
        print(f'WARNING: Could not access url {url}', file=sys.stderr)
        return None
    try:
        return r.json()
    except ValueError as e:
        # A 200 response with a non-JSON body is treated like any other miss.
        print(f'WARNING: Invalid JSON from url {url}: {e}', file=sys.stderr)
        return None
# Cache of already-known papers: S2 paper ID -> (title, abstract).
# Optionally pre-filled from a seed CSV with the same column layout this
# script writes (OpenReview ID, paper ID, title, abstract).
papers_map = {}
if args.csv_input:
    # newline='' lets the csv module do its own line-ending handling, which is
    # required for quoted fields containing embedded newlines.
    with open(args.csv_input, 'r', newline='') as csvfile:
        for entry in csv.reader(csvfile, delimiter=','):
            # Blank or truncated rows have no title/abstract columns; skip
            # them rather than failing with IndexError.
            if len(entry) < 4:
                continue
            papers_map[entry[1]] = (entry[2], entry[3])
papers_list = []  # NOTE(review): not referenced anywhere in the visible code

# Main export loop: one reviewer per input line, one CSV row per paper.
with requests.Session() as session, \
        open(args.reviewer_list, 'r') as reviewer_file, \
        open(args.csv_output, 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')
    for my_orid in tqdm(reviewer_file):
        my_orid = my_orid.strip()
        # Blank lines (e.g. a trailing newline) are not reviewer IDs.
        if not my_orid:
            continue

        # Get S2 ID from OR ID
        my_s2id = orid_to_s2id(my_orid)
        if not my_s2id:
            continue

        # Get S2 user (https, consistent with the paper endpoint below)
        user = query_api(f'https://api.semanticscholar.org/v1/author/{my_s2id}', session)
        if not user:
            continue

        # A response without a 'papers' list is treated as "no papers".
        for paper in user.get('papers') or []:
            my_pid = paper['paperId']

            # Retrieve from already-saved papers
            if my_pid in papers_map:
                my_title, my_abstract = papers_map[my_pid]
                csvwriter.writerow((my_orid, my_pid, my_title, my_abstract))
                continue

            # Retrieve from S2
            details = query_api(f'https://api.semanticscholar.org/v1/paper/{my_pid}', session)
            if not details:
                continue
            # .get() so a record lacking a field yields an empty CSV cell
            # instead of a KeyError that aborts the export.
            my_title = details.get('title')
            my_abstract = details.get('abstract')
            csvwriter.writerow((my_orid, my_pid, my_title, my_abstract))
            papers_map[my_pid] = (my_title, my_abstract)
            print((my_orid, my_pid, my_title, my_abstract), file=sys.stderr)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment