Skip to content

Instantly share code, notes, and snippets.

@binux
Forked from bigsquirrel/parse_dblp.py
Last active August 29, 2015 14:20
Show Gist options
  • Save binux/7bcdcac8c5959c4c50b8 to your computer and use it in GitHub Desktop.
Save binux/7bcdcac8c5959c4c50b8 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# filename: parse_dblp.py
# author: ivanchou
import codecs, os
import xml.etree.ElementTree as ET
# Tags of the elements treated as one record: when an element carrying one of
# these tags ends, the script terminates the current line of the output file.
paper_tag = (
    'article',
    'inproceedings',
    'proceedings',
    'book',
    'incollection',
    'phdthesis',
    'mastersthesis',
    'www',
)
class AllEntities:
    """Catch-all lookup table: every key is present and maps to itself.

    An instance is assigned to ``XMLParser.entity`` by this script.
    NOTE(review): presumably this makes an entity reference the parser does
    not know (e.g. ``&uuml;``) resolve to its bare name instead of aborting
    the parse -- confirm against ElementTree's handling of ``entity``.
    """

    def __getitem__(self, key):
        """Return *key* unchanged, so a lookup never raises ``KeyError``."""
        return key
# Stream 'dblp_part.xml' and write the authors of each record to the UTF-8
# file 'authors': every author is followed by '|', and a line separator is
# written when an element whose tag is in `paper_tag` ends.
print ('----------parse begin----------')

# Parser configured to cope with entities that the document itself does not
# declare: UseForeignDTD(True) is set on the underlying expat parser and the
# entity table is replaced with a catch-all mapping.
# NOTE(review): `parser.parser` is exposed by the pure-Python XMLParser; the
# C-accelerated XMLParser of newer Python 3 releases may not provide it --
# confirm the interpreter this script is run with.
parser = ET.XMLParser()
parser.parser.UseForeignDTD(True)
parser.entity = AllEntities()

# The `with` block guarantees the output is flushed and closed even if parsing
# fails part-way through.
with codecs.open('authors', 'w', 'utf-8') as result:
    # The configured parser must be handed to iterparse explicitly; otherwise
    # iterparse builds its own default parser and the setup above is unused.
    for event, article in ET.iterparse('dblp_part.xml', parser=parser):
        for author in article.findall('author'):
            # An empty <author/> has text None; write just the separator.
            result.write((author.text or u'') + u'|')
        if event == 'end' and article.tag in paper_tag:
            # codecs.open uses binary mode, so the platform line separator
            # is written explicitly.
            result.write(os.linesep)
            # Drop the finished record's children and text so memory use does
            # not grow with the size of the input.
            article.clear()

print ('----------parse end----------')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment