Last active
April 22, 2020 02:44
-
-
Save edwinhu/5ac05d0d261e62fa5655b2bf7bff8082 to your computer and use it in GitHub Desktop.
Download SEC Form ADV Part 2 brochures and extract the Item 11 section from each
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download every Form ADV Part 2 brochure archive (parts 1-112) from the SEC.
# The URL is quoted so the [1-112] range reaches curl's own URL globbing
# instead of being treated as a shell pattern; curl replaces "#1" in the
# output name with the current value of that range.
# --fail keeps HTTP error pages from being saved under a .zip name.
curl --fail "https://www.sec.gov/foia/docs/adv/formadv_part2_[1-112].zip" \
  -o "/data/hue/adv2/formadv_part2_#1.zip"

# List the contents of one archive.
# NOTE(review): everything below uses relative paths, so it presumably has to
# be run from the download directory (/data/hue/adv2) -- confirm.
unzip -l formadv_part2_1.zip

# Example on a single brochure: stream the PDF out of the archive, convert it
# to text, and keep the lines from a line starting with "item 11" through the
# next line starting with "item 12".
# IGNORECASE is a gawk extension (other awks silently ignore it and match
# case-sensitively), so gawk is invoked explicitly.
unzip -p formadv_part2_100.zip 103705_325511_1_20200131.pdf \
  | pdftotext - - \
  | gawk 'BEGIN { IGNORECASE = 1 } /^item 11/,/^item 12/'

# Build an index with one "<archive> <pdf-name>" pair per line.
# The loop output is redirected once, so re-running rebuilds index.txt
# instead of appending duplicate entries to it.
for f in *.zip; do
  [ -e "$f" ] || continue # the glob matched nothing
  # $NF is the last column of the `unzip -l` listing, i.e. the member name.
  unzip -l "$f" | gawk -v f="$f" '/\.pdf$/ { print f, $NF }'
done > index.txt

# Write the Item 11 text of every indexed brochure to item11/<pdf-name>.txt.
mkdir -p item11
while read -r z f; do
  [ -n "$f" ] || continue # skip blank or malformed index lines
  # stdin of unzip is detached so it can never consume the index being read.
  # ${f%.*} removes only the final extension, so dotted names stay distinct.
  unzip -p "$z" "$f" < /dev/null \
    | pdftotext -q - - \
    | gawk 'BEGIN { IGNORECASE = 1 } /^item 11/,/^item 12/' \
      > "item11/${f%.*}.txt"
done < index.txt
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment