Created
March 13, 2015 23:44
-
-
Save yong27/d82b13e2da1496be07e6 to your computer and use it in GitHub Desktop.
Metagenome basic profiler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from subprocess import Popen, PIPE | |
def len_fasta(filename): | |
p1 = Popen(['cat', filename], stdout=PIPE, stderr=PIPE) | |
p2 = Popen(['grep', '>'], stdin=p1.stdout, stdout=PIPE, stderr=PIPE) | |
p3 = Popen(['wc', '-l'], stdin=p2.stdout, stdout=PIPE, stderr=PIPE) | |
result, err = p3.communicate() | |
if p3.returncode != 0: | |
raise IOError(err) | |
return int(result.strip().split()[0]) | |
def show_report(fasta_filename): | |
print("Number of reads: {}".format(len_fasta(fasta_filename))) | |
blout_filename = fasta_filename.replace('fasta', 'blout') | |
data = pd.read_table(blout_filename, | |
names='qseqid sseqid pident length mismatch gapopen qstart ' \ | |
'qend sstart send evalue bitscore sgi staxids'.split(), | |
) | |
print("Number of annotations: {}".format(len(data))) | |
only_first_taxon = lambda s: s.split(';')[0] | |
data['staxids2'] = data['staxids'].apply(only_first_taxon) | |
subdata = data[['qseqid', 'staxids2', 'sgi']] | |
taxon_counts = subdata.groupby('staxids2').count().sort('qseqid', | |
ascending=False) | |
print("Number of taxons: {}".format(len(taxon_counts))) | |
print("Top 10 taxons") | |
for tax_id, taxon in taxon_counts.iloc[:10].iterrows(): | |
print(" - id {} has {} reads".format(tax_id, taxon['qseqid'])) | |
gene_counts = subdata.groupby('sgi').count().sort('qseqid', | |
ascending=False) | |
print("Number of genes: {}".format(len(gene_counts))) | |
print("Top 10 genes") | |
for gene_id, gene in gene_counts.iloc[:10].iterrows(): | |
print(" - id {} has {} reads".format(gene_id, gene['qseqid'])) | |
if __name__ == '__main__': | |
import argparse | |
parser = argparse.ArgumentParser() | |
parser.add_argument("fasta", help="input FASTA file") | |
args = parser.parse_args() | |
show_report(args.fasta) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment