Created
December 7, 2016 09:10
-
-
Save sxlijin/cec47bf76e9ad0011cb9fb43fdd155e4 to your computer and use it in GitHub Desktop.
Scrapes the VOICE survey data submitted by students.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/bash
# Scrapes the VOICE survey data submitted by students.
#
# Credentials: either edit the placeholders below, or export VUNETID and
# VUNETPW before running so the password never has to be saved in this file.
# NOTE(review): both values are still passed to curl on its command line.
VUNETID="${VUNETID:-put your vunetid here}"
VUNETPW="${VUNETPW:-put your vunetid password here}"

# login
# The ID and password go through --data-urlencode rather than -d: plain -d
# sends the value verbatim, so characters such as '&', '=', '+' or '%' in a
# password would corrupt the form body. The response page is discarded.
curl https://www.sds.vanderbilt.edu/perl/voiceview.pl \
  -X POST \
  -d VSASM_ASVBlock=425457605464733D246C743F77706A6466776A66782F716D3D2477743F425457604E6675693D246C743F4D4D4542513D2477743F42545760516275693D246C743F7878782F7465742F77626F656673636A6D752F666576307166736D3077706A6466776A66782F716D3D2477743F425457605476634E6675693D246C743F45464742564D553D2477743F42545760544A513D246C743F31423533453132473D2477743F425457604271713D246C743F57504A44463D2477743F425457605477733D246C743F54455432 \
  --data-urlencode "VSASM_user=${VUNETID}" \
  --data-urlencode "VSASM_pw=${VUNETPW}" \
  -d VSASM_Login=Login \
  > /dev/null
# get information for specific areas
#
# POSTs to the VOICE viewer endpoint with the fixed VSASM_BLOCK form field,
# followed by whatever extra curl arguments the caller supplies.
# Arguments: "$@" - additional curl options (e.g. -d ViewSchool=VUAS)
# Outputs:   whatever curl writes to stdout
# Returns:   curl's exit status
post_to_voice() {
  # "$@" is quoted so each caller argument reaches curl as exactly one word,
  # with no re-splitting on whitespace and no glob expansion.
  curl https://www.sds.vanderbilt.edu/perl/voiceview.pl \
    -X POST \
    -d VSASM_BLOCK=567466734A65666F753D246C743F6D6A6B6A6F743D2477743F4C667A324C3D246C743F3537353D2477743F544A513D246C743F3142353345313247 \
    "$@"
}
# Send the ActiveStudent / TermsAccepted form fields once and discard the
# reply (presumably this acknowledges the site's terms -- confirm).
post_to_voice -d VoiceViewUserType=ActiveStudent -d TermsAccepted=OK > /dev/null

# Rebuild dept-list.txt from scratch: for each school, keep every
# "<SCHOOL>:<UPPERCASE>" token that appears in that school's page.
: > dept-list.txt
for school in VUAS VUBLR VUENG; do
  post_to_voice -d "ViewSchool=${school}" \
    | grep -o "${school}:[A-Z]\+" \
    >> dept-list.txt
done
# Build course-list.txt with one SCHOOL:DEPT:COURSE line per course code found
# on each department's page. Input is dept-list.txt (SCHOOL:DEPT per line).
: > course-list.txt
while IFS=":" read -r SCHOOL DEPT; do
  AREA="${SCHOOL}:${DEPT}"
  # Pipeline: keep everything from the first "ViewCourse" line onward, drop
  # the first two lines of that, stop at the first line containing "SELECT"
  # and drop that line too, then pull out each ">CODE" token and rewrite the
  # ">" as the "SCHOOL:DEPT:" prefix.
  post_to_voice -d "ViewSchool=${SCHOOL}" -d "ViewArea=${AREA}" \
    | sed -n -e '/ViewCourse/,$p' \
    | tail -n +3 \
    | sed '/SELECT/q' \
    | sed '$d' \
    | grep -o ">[A-Z0-9]\+" \
    | sed "s/>/${AREA}:/" \
    >> course-list.txt
done < dept-list.txt
# Build score-links-list.txt: for every SCHOOL:DEPT:COURSE line in
# course-list.txt, collect the HREF target of each response line that
# mentions "Scores".
: > score-links-list.txt
while IFS=":" read -r SCHOOL DEPT COURSE; do
  AREA="${SCHOOL}:${DEPT}"
  # The HREF values come out of HTML, where '&' is written as the entity
  # "&amp;". Decode it so the saved line is a usable URL. (The previous
  # "s/&/\&/g" replaced '&' with itself and therefore did nothing.)
  post_to_voice -d "ViewSchool=${SCHOOL}" -d "ViewArea=${AREA}" -d "ViewCourse=${COURSE}" \
    | grep Scores \
    | grep -o "A HREF=\"[^\"]\+\"" \
    | sed "s/A HREF=//;s/\"//g" \
    | sed "s/&amp;/\&/g" \
    >> score-links-list.txt
done < course-list.txt
# Fetch the score page behind each collected link and write the extracted
# values to one text file per course under downloads/.
#
# to adjust this script to scrape data for more than
# courses in the fall 2015 semester, the "grep fall2015"
# call should be replaced with "cat", and the "cut"
# invocation that builds COURSE_FILE below should be adjusted

# The output redirection below fails unless the directory already exists.
mkdir -p downloads

grep fall2015 score-links-list.txt | \
while read -r URL
do
  # File name: comma-separated fields 2-6 of the "ScoreTgt=...&" part of the
  # URL, joined with '-' and given a .txt suffix.
  COURSE_FILE=$(printf '%s\n' "${URL}" \
    | grep -o 'ScoreTgt=.*&' \
    | cut -d, -f2-6 \
    | sed "s/,/-/g" \
    | sed "s/$/.txt/")
  # No ScoreTgt in the link means no file name; skip it rather than trying
  # to redirect onto the downloads/ directory itself.
  if [[ -z "${COURSE_FILE}" ]]; then
    printf 'skipping link without ScoreTgt: %s\n' "${URL}" >&2
    continue
  fi
  # Split the page at every "valign=center", strip the markup around each
  # value, then join values in pairs with ',' and six pairs per line with ':'.
  # NOTE(review): "\n" in the sed replacement relies on GNU sed.
  curl "${URL}" \
    | grep "valign=center" \
    | sed "s/valign=center/\n/g" \
    | sed "s/.*nowrap>//;s/ .*//" \
    | grep -v "T[RD]" \
    | tail -n +2 \
    | sed '$d' \
    | sed "s/.*>//g" \
    | paste - - -d"," \
    | paste - - - - - - -d":" \
    > "downloads/${COURSE_FILE}"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment