sunlightlabs/parse_house_disclosures.py

## parse_house_disclosures.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

'''
    This script was used to convert the 2009 Q3 House Disbursement PDF into detail and summary CSV files.
        Source PDF: http://disbursements.house.gov/
        Resulting Data: http://www.sunlightfoundation.com/projects/2009/expenditures/

    It was originally authored by Luke Rosiak with improvements by James Turk for Sunlight Labs and is released into the public domain.

    Disclaimer: It was written quickly under deadline and likely contains a few bugs - patches welcome

    It expects a file "members-only.txt" created as the result of the following two operations:
       pdftk <source.pdf> cat 241-2780 output members-only.pdf
       pdftotext -layout members-only.pdf
'''

import csv, re, sys

# Line prefixes that mark page furniture rather than record data.  Written as
# a raw string so that ``\s`` is handed to the regex engine as-is instead of
# relying on Python passing an unknown string escape through unchanged.
# NOTE(review): 'REPRESENATION' is spelled this way in the pattern - confirm
# it matches the text pdftotext actually produces.
BAD_LINE_RE = re.compile(r'^(Frm|Fmt|Sfmt|Jkt|VerDate|VOUCHER|OFFICIAL\sEXPENSES|MEMBERS\sREPRESENATION\sALLOW|PO|APPS06|PsN:|09:47|M:)')


def known_bad(line):
    '''
    Return True when line is noise that does not need to be logged.

    A line counts as known-bad when it is empty (or otherwise falsy), when it
    starts with one of the prefixes in BAD_LINE_RE, or when it contains the
    substring 'dkrause'.  The result is always a plain bool.
    '''
    return (not line) or bool(BAD_LINE_RE.match(line)) or 'dkrause' in line

def main():
    '''
    Convert "members-only.txt" into the detail and summary CSV files.

    Reads members-only.txt from the current working directory and classifies
    each stripped line as one of:
      - member header (starts with "2007 ", "2008 ", "2009 " or
        "FISCAL YEAR "): sets the current member and year
      - category header (exactly one of the names in cats): sets the current
        category
      - regular or personnel record: one row in house-disburse-detail.csv
      - category summary line: one row in house-disburse-summary.csv
    Any other line that known_bad() does not recognise is written to
    trashlines.txt, one per line, for manual inspection.

    All four files are closed when the function returns, including on error.
    '''
    cats = ['FRANKED MAIL', 'PERSONNEL COMPENSATION', 'PERSONNEL BENEFITS', 'TRAVEL', 'RENT, COMMUNICATION, UTILITIES', 'PRINTING AND REPRODUCTION', 'OTHER SERVICES', 'SUPPLIES AND MATERIALS', 'EQUIPMENT']

    thismem = ''
    thiscat = ''
    thisyear = ''
    # Most recent recipient that was not a "DO" (ditto) entry.  Starts empty
    # so a ditto appearing before any named recipient yields '' rather than
    # raising an unbound-variable error.
    oldrecip = ''

    regular_re = re.compile(r"""(\d{2}-\d{2})\s+            # date
                            ([0-9A-Z]{2})\s+                # transaction code
                            ([0-9A-Z]+)\s+                  # record id
                            (.*?)                           # recipient
                            (\d{2}/{1}\d{2}/{1}\d{2})\s+    # date-start
                            (\d{2}/{1}\d{2}/{1}\d{2})       # date-end
                            (.*?)\s+                        # description
                            (-?[0-9,]+\.\d{2})              # amount
                            """, re.VERBOSE)
    personnel_re = re.compile(r"""(.*?)                     # recipient
                              (\d{2}/{1}\d{2}/{1}\d{2})\s+  # date-start
                              (\d{2}/{1}\d{2}/{1}\d{2})     # date-end
                              (.*?)\s+                      # description
                              (-?[0-9,]+\.\d{2})            # amount
                              """, re.VERBOSE)
    summary_re = re.compile(r"""(.*?)\.+\s+         # category
                            (-?[0-9,]+\.\d{2})\s+   # 2009
                            (-?[0-9,]+\.\d{2})      # 2009-Q3
                            """, re.VERBOSE)

    # The input is opened first so a missing members-only.txt fails before
    # any output file is created or truncated.
    with open('members-only.txt', "r") as f, \
            open("house-disburse-summary.csv", "w") as summary_file, \
            open("house-disburse-detail.csv", "w") as detail_file, \
            open('trashlines.txt', 'w') as trashcan:
        fsummary = csv.writer(summary_file, quoting=csv.QUOTE_ALL)
        fdetail = csv.writer(detail_file, quoting=csv.QUOTE_ALL)

        for l in f:

            # replace UTF-8 minus with normal dash and strip
            l = l.replace('–', '-').strip()

            # new member
            if l.startswith(("2008 ", "2009 ", "2007 ", "FISCAL YEAR ")):
                # NOTE(review): for a "FISCAL YEAR " line these fixed slices
                # give thisyear == 'FISC'; confirm what such lines look like.
                thismem = l.replace('—', '')[5:]
                thisyear = l[:4]
                if thismem.endswith("Con."):
                    thismem = thismem[:-4]
                continue

            # category
            if l in cats:
                thiscat = l
                continue

            # detail record: try the regular layout first, then the
            # personnel layout, which has no date / code / record id
            ma = regular_re.search(l)
            if ma:
                date1, transcode, recordid, recip, date2, date3, descrip, amount = ma.groups()
            else:
                ma = personnel_re.search(l)
                if ma:
                    date1 = transcode = recordid = ""
                    recip, date2, date3, descrip, amount = ma.groups()

            if ma:
                recip = recip.strip().rstrip('.')
                # Stripping the dot leader from "DO ......" leaves the space
                # that separated the two, hence the trailing blank in 'DO '.
                if recip == 'DO ':
                    sunrecip = oldrecip
                else:
                    sunrecip = oldrecip = recip
                descrip = descrip.strip().rstrip('.')
                fdetail.writerow([thismem, thisyear, thiscat, date1, transcode, recordid, sunrecip, recip, date2, date3, descrip, amount])
                continue

            # summary record; a match whose label is not a known category
            # falls through to the trash check below
            ma = summary_re.search(l)
            if ma:
                m = ma.groups()
                if m[0].strip() in cats:
                    fsummary.writerow([thismem, m[0], m[1], m[2]])
                    continue

            if not known_bad(l):
                # l was stripped above, so put the line break back; without
                # it every rejected line runs together on one line.
                trashcan.write(l + '\n')

# Run the conversion only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -- coding: utf-8 --

	'''
	This script was used to convert the 2009 Q3 House Disbursement PDF into detail and summary CSV files.
	Source PDF: http://disbursements.house.gov/
	Resulting Data: http://www.sunlightfoundation.com/projects/2009/expenditures/

	It was originally authored by Luke Rosiak with improvements by James Turk for Sunlight Labs and is released into the public domain.

	Disclaimer: It was written quickly under deadline and likely contains a few bugs - patches welcome

	It expects a file "members-only.txt" created as the result of the following two operations:
	pdftk <source.pdf> cat 241-2780 output members-only.pdf
	pdftotext -layout members-only.pdf
	'''

	import csv, re, sys

	BAD_LINE_RE = re.compile('^(Frm\|Fmt\|Sfmt\|Jkt\|VerDate\|VOUCHER\|OFFICIAL\sEXPENSES\|MEMBERS\sREPRESENATION\sALLOW\|PO\|APPS06\|PsN:\|09:47\|M:)')

	def known_bad(line):
	return (not line) or BAD_LINE_RE.match(line) or 'dkrause' in line

	def main():
		'''
		Convert "members-only.txt" into the detail and summary CSV files.

		Reads members-only.txt from the current working directory and
		classifies each stripped line as one of:
		  - member header (starts with "2007 ", "2008 ", "2009 " or
		    "FISCAL YEAR "): sets the current member and year
		  - category header (exactly one of the names in cats): sets the
		    current category
		  - regular or personnel record: one row in house-disburse-detail.csv
		  - category summary line: one row in house-disburse-summary.csv
		Any other line that known_bad() does not recognise is written to
		trashlines.txt, one per line, for manual inspection.

		All four files are closed when the function returns, including on
		error.
		'''
		cats = ['FRANKED MAIL', 'PERSONNEL COMPENSATION', 'PERSONNEL BENEFITS', 'TRAVEL', 'RENT, COMMUNICATION, UTILITIES', 'PRINTING AND REPRODUCTION', 'OTHER SERVICES', 'SUPPLIES AND MATERIALS', 'EQUIPMENT']

		thismem = ''
		thiscat = ''
		thisyear = ''
		# Most recent recipient that was not a "DO" (ditto) entry.  Starts
		# empty so a ditto appearing before any named recipient yields ''
		# rather than raising an unbound-variable error.
		oldrecip = ''

		regular_re = re.compile(r"""(\d{2}-\d{2})\s+ # date
		([0-9A-Z]{2})\s+ # transaction code
		([0-9A-Z]+)\s+ # record id
		(.*?) # recipient
		(\d{2}/{1}\d{2}/{1}\d{2})\s+ # date-start
		(\d{2}/{1}\d{2}/{1}\d{2}) # date-end
		(.*?)\s+ # description
		(-?[0-9,]+\.\d{2}) # amount
		""", re.VERBOSE)
		personnel_re = re.compile(r"""(.*?) # recipient
		(\d{2}/{1}\d{2}/{1}\d{2})\s+ # date-start
		(\d{2}/{1}\d{2}/{1}\d{2}) # date-end
		(.*?)\s+ # description
		(-?[0-9,]+\.\d{2}) # amount
		""", re.VERBOSE)
		summary_re = re.compile(r"""(.*?)\.+\s+ # category
		(-?[0-9,]+\.\d{2})\s+ # 2009
		(-?[0-9,]+\.\d{2}) # 2009-Q3
		""", re.VERBOSE)

		# The input is opened first so a missing members-only.txt fails
		# before any output file is created or truncated.
		with open('members-only.txt', "r") as f, \
				open("house-disburse-summary.csv", "w") as summary_file, \
				open("house-disburse-detail.csv", "w") as detail_file, \
				open('trashlines.txt', 'w') as trashcan:
			fsummary = csv.writer(summary_file, quoting=csv.QUOTE_ALL)
			fdetail = csv.writer(detail_file, quoting=csv.QUOTE_ALL)

			for l in f:

				# replace UTF-8 minus with normal dash and strip
				l = l.replace('–', '-').strip()

				# new member
				if l.startswith(("2008 ", "2009 ", "2007 ", "FISCAL YEAR ")):
					# NOTE(review): for a "FISCAL YEAR " line these fixed
					# slices give thisyear == 'FISC'; confirm what such
					# lines look like.
					thismem = l.replace('—', '')[5:]
					thisyear = l[:4]
					if thismem.endswith("Con."):
						thismem = thismem[:-4]
					continue

				# category
				if l in cats:
					thiscat = l
					continue

				# detail record: try the regular layout first, then the
				# personnel layout, which has no date / code / record id
				ma = regular_re.search(l)
				if ma:
					date1, transcode, recordid, recip, date2, date3, descrip, amount = ma.groups()
				else:
					ma = personnel_re.search(l)
					if ma:
						date1 = transcode = recordid = ""
						recip, date2, date3, descrip, amount = ma.groups()

				if ma:
					recip = recip.strip().rstrip('.')
					# Stripping the dot leader from "DO ......" leaves the
					# space that separated the two, hence 'DO ' with a
					# trailing blank.
					if recip == 'DO ':
						sunrecip = oldrecip
					else:
						sunrecip = oldrecip = recip
					descrip = descrip.strip().rstrip('.')
					fdetail.writerow([thismem, thisyear, thiscat, date1, transcode, recordid, sunrecip, recip, date2, date3, descrip, amount])
					continue

				# summary record; a match whose label is not a known
				# category falls through to the trash check below
				ma = summary_re.search(l)
				if ma:
					m = ma.groups()
					if m[0].strip() in cats:
						fsummary.writerow([thismem, m[0], m[1], m[2]])
						continue

				if not known_bad(l):
					# l was stripped above, so put the line break back;
					# without it every rejected line runs together.
					trashcan.write(l + '\n')

	# Run the conversion only when this file is executed as a script.  The
	# call must be indented under the ``if`` to form its body.
	if __name__ == '__main__':
		main()