Last active
May 19, 2020 18:19
-
-
Save yong27/5873098 to your computer and use it in GitHub Desktop.
GEO file downloader from query search result CSV
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*-
"""
GEO file downloader

Requirements
 * beautifulsoup4 (pip install lxml beautifulsoup4)
 * wget

Usage
 1. Get the CSV file from the GEO browse page.
    http://www.ncbi.nlm.nih.gov/geo/browse/?view=series&tax=9606 is the URL
    for Human series. In the web page, click Export and download the CSV file.
 2. python geo_downloader.py < above.csv
    This command makes the directory "series/geo_accession", fetches the GEO
    series matrix file and raw file from the server, and writes a
    "platform/geo_accession" file with that platform's probe information.
"""
import os | |
import csv | |
import time | |
import urllib2 | |
from ftplib import FTP | |
import subprocess | |
from bs4 import BeautifulSoup | |
NETWORK_WAIT_SEC = 30  # seconds to sleep between network retries


def urlopen(url, wait_sec=NETWORK_WAIT_SEC):
    """Open *url* with urllib2, retrying forever on failure.

    Sleeps ``wait_sec`` seconds after each failed attempt so transient
    network errors do not abort a long download run.  Note there is no
    retry cap: an unreachable URL loops indefinitely.

    Returns the file-like response object from ``urllib2.urlopen``.
    """
    result = None
    while not result:
        try:
            result = urllib2.urlopen(url)
        except Exception as e:  # was 'except Exception, e' (py2-only syntax)
            print('url %s has problems, try again after %s sec...' % (
                url, wait_sec))
            print(str(e))
            time.sleep(wait_sec)
    return result
class GeoParser(object):
    """Fetch a GEO accession page and hold its parsed HTML soup."""

    geo_url = 'http://www.ncbi.nlm.nih.gov'
    geo_query_url = geo_url + '/geo/query/'

    def __init__(self, geo_id):
        """Download the accession page for *geo_id* and parse it with lxml."""
        self.geo_id = geo_id
        self.url = '%sacc.cgi?acc=%s' % (self.geo_query_url, geo_id)
        page = urlopen(self.url)
        self.soup = BeautifulSoup(page, 'lxml')

    def parse_main_table(self):
        """Collect key/value pairs from the page's main 600px-wide table.

        Walks sibling <tr> rows, taking the first cell's text as the key
        and the next cell's text as the value; stops (via AttributeError)
        when the expected row structure runs out.
        """
        main_table = self.soup.find('table', cellpadding='2', cellspacing='0',
                                    width='600')
        row = main_table.tr.next_sibling.next_sibling
        parsed = {}
        while True:
            try:
                label = row.td.getText()
                parsed[label] = row.td.next_sibling.next_sibling.getText()
                row = row.next_sibling.next_sibling
            except AttributeError:
                break
        return parsed
class SampleParser(GeoParser):
    """Parse a GEO sample (GSM) accession page."""

    def __init__(self, sample_id, series_id, platform_id):
        """Fetch the sample page and remember its series/platform ids."""
        super(SampleParser, self).__init__(sample_id)
        self.sample_id = sample_id
        self.series_id = series_id
        self.platform_id = platform_id

    def get_sample_protocol_data(self):
        """Return (sample_data, protocol_data) dicts from the main table.

        Missing page labels simply map to None.
        """
        table = self.parse_main_table()
        # (output key, GEO page label) pairs for each result dict.
        sample_fields = (
            ('title', 'Title'),
            ('source_nm', 'Source name'),
            ('organism', 'Organism'),
            ('characteristics_tag', 'Characteristics'),
            ('biomaterial_provider', 'Biomaterial provider'),
            ('molecule', 'Extracted molecule'),
            ('label', 'Label'),
            ('description', 'Description'),
        )
        protocol_fields = (
            ('growth', 'Growth protocol'),
            ('treatment', 'Treatment protocol'),
            ('extract', 'Extraction protocol'),
            ('label', 'Label protocol'),
            ('hybridization', 'Hybridization protocol'),
            ('scan', 'Scan protocol'),
            ('data_processing', 'Data processing'),
            ('value_definition', 'Value definition'),
        )
        sample_data = dict((key, table.get(label))
                           for key, label in sample_fields)
        sample_data['sample_nm'] = self.sample_id
        protocol_data = dict((key, table.get(label))
                             for key, label in protocol_fields)
        return sample_data, protocol_data
class SeriesParser(GeoParser):
    """Parse a GEO series (GSE) page and download its matrix/raw files."""

    def __init__(self, geo_id):
        """Fetch the series page; downloads land under series/<geo_id>/."""
        super(SeriesParser, self).__init__(geo_id)
        self.series_id = geo_id
        self.target_path = os.path.join('series', self.series_id)

    def get_series_data(self):
        """Return a dict of series-level metadata from the main table."""
        d = self.parse_main_table()
        series_data = {
            'geo_series_id': self.series_id,
            'title': d.get('Title'),
            'summary': d.get('Summary'),
            'overall_design': d.get('Overall design'),
            'contributor': d.get('Contributor(s)'),
            'url': self.url,
            'type': d.get('Experiment type', ''),
            'organism': d.get('Sample organism'),
        }
        print('series data... %s' % (series_data,))
        return series_data

    def get_platform_id(self):
        """Return the platform (GPL) accession linked from the series page."""
        table = self.soup.find(
            'table', style='position:relative;top:-5px;left:-5px')
        return table.tr.td.string

    def get_sample_ids(self):
        """Return the list of sample (GSM) accessions shown on the page.

        GEO renders samples in two tables (the second one behind a script
        toggle), so both are walked.  NOTE(review): this traversal depends
        on the exact 2013-era GEO page markup — verify before reuse.
        """
        table = self.soup.find(
            'table', style='position:relative;top:-5px;left:-5px')
        tr = table.parent.parent.next_sibling.next_sibling.tr
        sample_ids = []
        while tr:
            sample_ids.append(tr.td.string)
            tr = tr.next_sibling.next_sibling
        script = table.parent.parent.next_sibling.next_sibling.script
        tr = script.next_sibling.next_sibling.table.tr
        while tr:
            sample_ids.append(tr.td.string)
            try:
                tr = tr.next_sibling.next_sibling
            except AttributeError:
                break
        return sample_ids

    def get_matrix_url(self):
        """Return the FTP URL of the first series-matrix file on the server.

        Retries the FTP connection forever, sleeping NETWORK_WAIT_SEC
        between attempts.
        """
        domain = 'ftp.ncbi.nlm.nih.gov'
        path = 'pub/geo/DATA/SeriesMatrix/%s' % self.series_id
        files = []
        ftp = None
        while not ftp:
            try:
                ftp = FTP(domain)
            except Exception:  # was a bare except: it also trapped Ctrl-C
                print('ftp connection has problem. retry...')
                time.sleep(NETWORK_WAIT_SEC)
        ftp.login()
        ftp.set_pasv(True)
        ftp.cwd(path)
        ftp.dir(files.append)
        # Take the first listing entry; raises IndexError if the dir is empty.
        filename = files[0].split()[-1]
        return "ftp://%s/%s/%s" % (domain, path, filename)

    def get_matrix_filename(self):
        """Download and gunzip the series matrix file; return its local path."""
        url = self.get_matrix_url()
        print('matrix_url... %s' % url)
        p = subprocess.Popen(['wget', '-P', self.target_path, url],
                             stdout=subprocess.PIPE)
        p.communicate()  # wait for the download to finish
        filename = url.split('/')[-1]
        p = subprocess.Popen(['gunzip', '-f',
                              os.path.join(self.target_path, filename)],
                             stdout=subprocess.PIPE)
        p.communicate()  # wait for decompression to finish
        filename = filename.replace('.gz', '')
        return os.path.join(self.target_path, filename)

    def get_matrix_rawfiles(self):
        """Parse the matrix file into rows and fetch the raw files, if any.

        Returns the data rows (lines not starting with '!') with quotes
        stripped.  Also downloads the supplementary raw file when the
        matrix header advertises one.
        """
        rawfile_url = None
        matrix = []
        with open(self.get_matrix_filename()) as f:
            for line in f:
                if line.startswith('!Series_supplementary_file'):
                    rawfile_url = line.split()[-1].replace('"', '')
                if not line.startswith('!'):
                    words = [w.replace('"', '') for w in line.strip().split()]
                    if words:
                        matrix.append(words)
        if rawfile_url:
            self.download_rawfiles(rawfile_url)
        return matrix

    def download_rawfiles(self, rawfile_url):
        """wget *rawfile_url* into the series target directory."""
        p = subprocess.Popen(['wget', '-P', self.target_path, rawfile_url],
                             stdout=subprocess.PIPE)
        output = p.communicate()[0]
        print(output)
class PlatformParser(GeoParser):
    """Parse a GEO platform (GPL) page and download its probe table."""

    def __init__(self, geo_id):
        """Fetch the platform page for *geo_id*."""
        super(PlatformParser, self).__init__(geo_id)
        self.platform_id = geo_id

    def get_platform_data(self):
        """Return a dict of platform metadata scraped row-by-row.

        NOTE(review): relies on the fixed row order of the 2013-era GEO
        page (Status, Title, Technology type, Distribution, Organism,
        Manufacturer, Manufacture protocol, Description, Web link) —
        verify against the live markup before reuse.
        """
        table = self.soup.find('table', cellpadding='2', cellspacing='0',
                               width='600')
        tr = table.tr.next_sibling.next_sibling
        status = tr.contents[2].contents[0]  # read but not returned
        tr = tr.next_sibling.next_sibling
        title = tr.contents[2].contents[0]
        tr = tr.next_sibling.next_sibling
        technology_type = tr.contents[2].contents[0]
        tr = tr.next_sibling.next_sibling
        distribution = tr.contents[2].contents[0]
        tr = tr.next_sibling.next_sibling
        organism = tr.contents[2].contents[0].string
        tr = tr.next_sibling.next_sibling
        manufacturer = tr.contents[2].contents[0]
        tr = tr.next_sibling.next_sibling
        manufacture_protocol = tr.contents[2].contents[0]
        # Double sibling hop: skip one row to reach the description.
        tr = tr.next_sibling.next_sibling.next_sibling.next_sibling
        description = '\n'.join(str(s) for s in tr.contents[2])
        tr = tr.next_sibling.next_sibling.next_sibling.next_sibling
        web_link = tr.contents[2].contents[0].string
        platform_data = {
            'geo_platform_id': self.platform_id,
            'title': title,
            'technology': technology_type,
            'distribution': distribution,
            'organism': organism,
            'manufacturer': manufacturer,
            'manufacture_protocol': manufacture_protocol,
            'description': description,
            'url': web_link,
        }
        print('platform data... %s' % (platform_data,))
        return platform_data

    def get_probe_file(self):
        """Return an open URL handle for the platform's full probe table.

        Extracts the target path from the onclick handler of the
        'Download full table...' button.
        """
        # 'button' was named 'input', shadowing the builtin.
        button = self.soup.find('input', value='Download full table...')
        onclick = button.attrs['onclick']
        # onclick looks like OpenLink('<path>', ...); keep only <path>.
        probe_url = self.geo_url + onclick.split(',')[0].replace(
            "OpenLink('", '')[:-1]
        print('probe_url... %s' % probe_url)
        return urlopen(probe_url)

    def write_probe_file(self):
        """Save the probe table to platform/<platform_id>."""
        try:
            os.mkdir('platform')
        except OSError:  # directory already exists; was a bare except
            pass
        with open(os.path.join('platform', self.platform_id), 'w') as outfile:
            outfile.write(self.get_probe_file().read())
def main(csvfile):
    """Download every GEO series listed in *csvfile* (a GEO browse export).

    Skips the CSV header, then for each record downloads the series
    matrix/raw files and, once per distinct platform, the platform's
    probe table.
    """
    next(csvfile)  # skip the header row; csvfile.next() was py2-only
    platforms = set()  # platforms already downloaded; set gives O(1) lookup
    for record in csv.reader(csvfile):
        geo_id = record[0]
        print('get geo series %s...' % geo_id)
        sp = SeriesParser(geo_id)
        sp.get_matrix_rawfiles()
        platform_id = sp.get_platform_id()
        if platform_id not in platforms:
            PlatformParser(platform_id).write_probe_file()
            platforms.add(platform_id)
        print(' %s done!' % geo_id)
        print('----------------------------')
if __name__ == '__main__':
    import sys
    # Feed the GEO browse-page CSV export on stdin:
    #   python geo_downloader.py < above.csv
    main(sys.stdin)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment