Skip to content

Instantly share code, notes, and snippets.

@hyeshik
Last active July 31, 2018 01:33
Show Gist options
  • Save hyeshik/709a7dfbc629a76801171e8a4c417c22 to your computer and use it in GitHub Desktop.
Save hyeshik/709a7dfbc629a76801171e8a4c417c22 to your computer and use it in GitHub Desktop.
Download WASET conference abstracts
#!/usr/bin/env python3
#
# Copyright (c) 2018 Hyeshik Chang
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
from urllib.request import urlopen, Request
from urllib.error import HTTPError
import time
import bsddb3
import random
import sys
import numpy as np
class WasetAbstractRetriever:
    """Download WASET abstract records into a local Berkeley DB hash file.

    Each abstract number in ``[FIRST_ABSTRACT_NUMBER, LAST_ABSTRACT_NUMBER)``
    is requested from ``JSON_URL_FORMAT`` and the raw response body is stored
    under the key ``str(number).encode()``. When the server answers with an
    HTTP error, the marker ``b'HTTP<code>'`` is stored instead, so that number
    is not requested again. Numbers already present in the database are
    skipped, which makes an interrupted run resumable.
    """

    FIRST_ABSTRACT_NUMBER = 0
    # Exclusive upper bound of the range that is scanned.
    LAST_ABSTRACT_NUMBER = 100000  # Change this when there're more updates
    DELAY_DURATIONS = [1, 2, 4, 5]  # Randomly chosen
    JSON_URL_FORMAT = 'https://waset.org/Publications/XML?abstract={number}&t=json'
    # A progress line is printed every STATUS_PERIOD processed abstracts.
    STATUS_PERIOD = 25
    # Seconds to wait on a single request before giving up on it.
    REQUEST_TIMEOUT = 60
    HTTP_HEADER = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }

    def __init__(self, dbfile):
        """Open (creating if necessary) *dbfile* and queue the missing abstracts.

        Args:
            dbfile: Path of the bsddb3 hash database used as the result store.
        """
        self.db = bsddb3.hashopen(dbfile, 'c')
        self.abstract_numbers = self.initiate_article_ids()

    def initiate_article_ids(self):
        """Return the abstract numbers not yet in the database, shuffled.

        Returns:
            A list of ints in random order; every number in the configured
            range whose encoded key is absent from ``self.db``.
        """
        artids = [
            nid
            for nid in range(self.FIRST_ABSTRACT_NUMBER, self.LAST_ABSTRACT_NUMBER)
            if str(nid).encode() not in self.db
        ]
        random.shuffle(artids)
        print('=> {0} abstracts to go ({1} done)'.format(len(artids), len(self.db)))
        return artids

    def retrieve_article(self, artid):
        """Fetch one abstract and store the outcome in the database.

        A successful response body is stored as-is; an HTTP error status is
        stored as ``b'HTTP<code>'``. Any other network failure (unreachable
        host, reset connection, timeout) is reported and nothing is stored,
        so the abstract is picked up again on the next run.

        Args:
            artid: Abstract number to download.
        """
        dbkey = str(artid).encode()
        if dbkey in self.db:
            return

        url = self.JSON_URL_FORMAT.format(number=artid)
        req = Request(url, data=None, headers=self.HTTP_HEADER)

        print('--> Retrieving', artid, end='\t')
        sys.stdout.flush()
        try:
            # The context manager closes the connection deterministically.
            with urlopen(req, timeout=self.REQUEST_TIMEOUT) as response:
                payload = response.read()
        except HTTPError as err:
            payload = 'HTTP{}'.format(err.code).encode()
            err.close()  # the error object wraps the open response
            print('-', err.code)
        except OSError as err:
            # URLError and socket timeouts are OSError subclasses. These are
            # transient, so do not record them and do not abort the crawl.
            print('- failed:', err)
            return
        else:
            print('- okay')

        self.db[dbkey] = payload
        self.db.sync()

    def run(self):
        """Retrieve every queued abstract, pausing randomly between requests."""
        total = len(self.abstract_numbers)
        for done, artid in enumerate(self.abstract_numbers, start=1):
            self.retrieve_article(artid)

            if done % self.STATUS_PERIOD == 0:
                print('=> {0} abstracts to go ({1} done)'.format(total - done, len(self.db)))

            # No point in waiting once the last abstract has been handled.
            if done < total:
                time.sleep(random.choice(self.DELAY_DURATIONS))
if __name__ == '__main__':
    # Script entry point: crawl into the default database file in the
    # current working directory.
    retriever = WasetAbstractRetriever('waset-abstracts.db')
    retriever.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment