Created
December 2, 2009 20:32
-
-
Save sunlightlabs/247540 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
This script was used to convert the 2009 Q3 House Disbursement PDF into detail and summary CSV files.

Source PDF: http://disbursements.house.gov/
Resulting Data: http://www.sunlightfoundation.com/projects/2009/expenditures/

It was originally authored by Luke Rosiak with improvements by James Turk for Sunlight Labs and is released into the public domain.

Disclaimer: It was written quickly under deadline and likely contains a few bugs - patches welcome.

It expects a file "members-only.txt" created as the result of the following two operations:

    pdftk cat 241 2780 output members-only.pdf
    pdftotext -layout members-only.pdf
'''
import csv
import re
import sys
# Line prefixes that known_bad() treats as noise. Written as a raw string so
# that ``\s`` reaches the regex engine as-is instead of being an invalid
# string escape (which newer Python versions warn about).
# NOTE(review): "REPRESENATION" is kept exactly as originally written; if the
# input text spells it "REPRESENTATION" this alternative never matches --
# confirm against members-only.txt before changing it.
BAD_LINE_RE = re.compile(
    r'^(Frm|Fmt|Sfmt|Jkt|VerDate|VOUCHER|OFFICIAL\sEXPENSES|'
    r'MEMBERS\sREPRESENATION\sALLOW|PO|APPS06|PsN:|09:47|M:)'
)


def known_bad(line):
    """Return True if *line* is recognised noise that need not be logged.

    A line counts as known-bad when it is empty, when it starts with one of
    the prefixes in ``BAD_LINE_RE``, or when it contains the substring
    ``'dkrause'``.

    Always returns a real ``bool`` (the match object is not leaked), so the
    result is safe to compare or store; truthiness is unchanged for callers
    that only test it.
    """
    if not line:
        return True
    if BAD_LINE_RE.match(line):
        return True
    return 'dkrause' in line
def main():
    """Parse ``members-only.txt`` into detail and summary CSV files.

    Reads ``members-only.txt`` from the current directory line by line and
    writes three files next to it:

    * ``house-disburse-detail.csv`` -- one row per matched record:
      member, year, category, date, transaction code, record id,
      resolved recipient, recipient as printed, start date, end date,
      description, amount.  Personnel-style records (no leading
      date/code/id) leave those three columns empty.
    * ``house-disburse-summary.csv`` -- member, category and the two
      amounts of each summary line whose category is listed in ``cats``.
    * ``trashlines.txt`` -- one line per input line that matched nothing
      and is not rejected by ``known_bad``.

    All files are closed when the function returns, including on error.
    """
    cats = ['FRANKED MAIL', 'PERSONNEL COMPENSATION', 'PERSONNEL BENEFITS', 'TRAVEL', 'RENT, COMMUNICATION, UTILITIES', 'PRINTING AND REPRODUCTION', 'OTHER SERVICES', 'SUPPLIES AND MATERIALS', 'EQUIPMENT']

    # Parser state carried from line to line.
    thismem = ''
    thiscat = ''
    thisyear = ''
    # Most recent recipient that was not the 'DO ' placeholder.  Initialised
    # here so a 'DO ' record appearing before any named recipient yields an
    # empty resolved recipient instead of raising UnboundLocalError.
    oldrecip = ''

    regular_re = re.compile(r"""(\d{2}-\d{2})\s+        # date
                    ([0-9A-Z]{2})\s+                    # transaction code
                    ([0-9A-Z]+)\s+                      # record id
                    (.*?)                               # recipient
                    (\d{2}/{1}\d{2}/{1}\d{2})\s+        # date-start
                    (\d{2}/{1}\d{2}/{1}\d{2})           # date-end
                    (.*?)\s+                            # description
                    (-?[0-9,]+\.\d{2})                  # amount
                    """, re.VERBOSE)
    personel_re = re.compile(r"""(.*?)                  # recipient
                    (\d{2}/{1}\d{2}/{1}\d{2})\s+        # date-start
                    (\d{2}/{1}\d{2}/{1}\d{2})           # date-end
                    (.*?)\s+                            # description
                    (-?[0-9,]+\.\d{2})                  # amount
                    """, re.VERBOSE)
    summary_re = re.compile(r"""(.*?)\.+\s+             # category
                    (-?[0-9,]+\.\d{2})\s+               # 2009
                    (-?[0-9,]+\.\d{2})                  # 2009-Q3
                    """, re.VERBOSE)

    # NOTE(review): the open() arguments are left exactly as they were so the
    # script keeps running on old interpreters; on Python 3 the csv docs
    # recommend newline='' for files handed to csv.writer.
    with open('members-only.txt', "r") as f, \
            open("house-disburse-summary.csv", "w") as summary_file, \
            open("house-disburse-detail.csv", "w") as detail_file, \
            open('trashlines.txt', 'w') as trashcan:
        fsummary = csv.writer(summary_file, quoting=csv.QUOTE_ALL)
        fdetail = csv.writer(detail_file, quoting=csv.QUOTE_ALL)

        for l in f:
            # replace en dash with a normal dash and strip
            l = l.replace('–', '-').strip()

            # new member
            # NOTE(review): the slicing below assumes a 4-character year plus
            # a space; for "FISCAL YEAR " lines it yields thisyear == 'FISC'
            # -- confirm that is intended.
            if l.startswith(("2008 ", "2009 ", "2007 ", "FISCAL YEAR ")):
                thismem = l.replace('—', '')[5:]
                thisyear = l[:4]
                if thismem.endswith("Con."):
                    thismem = thismem[:-4]
                continue

            # category
            if l in cats:
                thiscat = l
                continue

            # detail record: try the full "regular" layout first, then the
            # shorter personnel layout which lacks date / code / record id.
            ma = regular_re.search(l)
            if ma:
                groups = ma.groups()
                lead, rest = list(groups[:3]), groups[3:]
            else:
                ma = personel_re.search(l)
                if ma:
                    lead, rest = ["", "", ""], ma.groups()
            if ma:
                recip = rest[0].strip().rstrip('.')
                # 'DO ' is what remains of a "DO ...." cell once the dot
                # leaders are stripped; such rows reuse the previous
                # recipient (presumably a ditto mark).
                if recip == 'DO ':
                    sunrecip = oldrecip
                else:
                    sunrecip = recip
                    oldrecip = recip
                descrip = rest[3].strip().rstrip('.')
                fdetail.writerow([thismem, thisyear, thiscat] + lead +
                                 [sunrecip, recip, rest[1], rest[2], descrip, rest[4]])
                continue

            # summary record
            ma = summary_re.search(l)
            if ma:
                m = ma.groups()
                if m[0].strip() in cats:
                    fsummary.writerow([thismem, m[0], m[1], m[2]])
                # NOTE(review): a summary-shaped line whose category is not
                # in `cats` is skipped here without being logged as trash --
                # confirm this is the intended behaviour.
                continue

            if not known_bad(l):
                # `l` was stripped above, so restore a line break; otherwise
                # every rejected line runs together on a single line.
                trashcan.write(l + '\n')
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment