-
-
Save congzhangzh/c8c07a7cc3a8f28999ebb82c49e59070 to your computer and use it in GitHub Desktop.
wget with https and cookie login
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Log in to the site with wget, keep the resulting cookies, then download
# every page of the paginated "people" listing.

cookie_jar="my-cookies.txt"
last_page=353   # listing pages are numbered 0..353

# First log in and store the cookie. --keep-session-cookies makes wget write
# session cookies to the cookie file as well.
if ! wget --post-data='name=USERNAME&pass=PASSWORD&op=Log%20in' \
          --save-cookies="$cookie_jar" --keep-session-cookies \
          "https://private.site.com" > /dev/null 2>&1
then
    # Without a successful login request every page fetch below would be
    # unauthenticated, so stop here instead of scraping the wrong content.
    echo "login request failed; not scraping" >&2
    exit 1
fi

# Now we can scrape the site.
for (( i = 0; i <= last_page; i++ ))
do
    echo "grabbing page $i..."
    wget --cookies=on --keep-session-cookies --load-cookies="$cookie_jar" \
         "https://private.site.com/people?page=$i" > /dev/null 2>&1
    # Sleep for 1-3 seconds so as not to hammer the server, and to keep up
    # the pretence that we are human. $(( )) replaces the deprecated $[ ] form.
    sleep "$(( (RANDOM % 3) + 1 ))s"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Log in to the site with wget, keep the resulting cookies, then download
# every URL listed (one per line) in the file "user_list".

# First log in and store the cookie.
if ! wget --post-data='name=USERNAME&pass=PASSWORD&op=Log%20in' \
          --save-cookies=my-cookies.txt --keep-session-cookies \
          "https://private.site.com" > /dev/null 2>&1
then
    # A failed login request would make every download below unauthenticated.
    echo "login request failed; not downloading profiles" >&2
    exit 1
fi

echo "grabbing all people from user list..."
# -k rewrites links for local viewing, -p also fetches page requisites,
# -R skips files whose names end in css or js, and --wait/--random-wait
# space the requests out.
wget -k -p -R '*css,*js' -i "user_list" --wait=1 --random-wait \
     --cookies=on --keep-session-cookies --load-cookies=my-cookies.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'set' | |
# Collects profile link targets from every saved listing page matching
# +dir_glob+ and adds them to +users+.
#
# For each file, the href of every <a> directly inside a
# <div class="profile_card_popup_actions_profile"> is added to the set.
# Progress and summary information is printed to stdout.
#
# @param users [Set] set the hrefs are added to (mutated in place)
# @param dir_glob [String] glob pattern selecting the saved HTML pages
# @return [Set] the same +users+ set that was passed in
def parseFiles(users, dir_glob)
  puts "parsing '#{dir_glob}'"
  dupes = 0
  Dir.glob(dir_glob) do |file|
    # The block form closes the file handle as soon as parsing is done;
    # a bare open(file) would leak one handle per page.
    doc = File.open(file) { |f| Nokogiri::HTML(f) }
    # find all 'a' tags which have a parent 'div' which has
    # class 'profile_card_popup_actions_profile'
    links = doc.xpath('//div[@class = "profile_card_popup_actions_profile"]/a')
    # Flag any page that does not carry exactly 50 profile links.
    if links.size != 50
      puts "Links size strange: ", links.size
    end
    links.each do |a|
      value = a['href']
      # An anchor without an href has nothing to record.
      next if value.nil?

      # Very short hrefs are printed so they can be inspected by hand.
      puts value if value.length < 10
      # Set#add? returns nil when the value was already present.
      dupes += 1 unless users.add?(value)
    end
  end
  puts users.size
  puts "total dupes = #{dupes}"
  users
end
# Gather every profile link from the saved listing pages in the current
# directory, report how many were found, and write one absolute URL per
# line to the file 'user_list'.
users = parseFiles(Set.new, "./people*")
puts users.size
File.open('user_list', 'w') do |out|
  users.each { |path| out.puts("https://private.site.com/#{path}") }
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'nokogiri' | |
require 'set' | |
# Extracts contact details from every saved profile page matching
# +dir_glob+ and adds one Hash per page to +users+.
#
# Each Hash has the String keys 'name', 'title', 'organisation',
# 'nationality', 'primary_email', 'secondary_email', 'work_phone' and
# 'mobile_phone', inserted in that order. Each value is the serialized
# children of the nodes matched by the field's XPath (an empty string when
# nothing matches).
#
# @param users [Set] set the per-profile hashes are added to (mutated)
# @param dir_glob [String] glob pattern selecting the saved profile pages
# @return [Set] the same +users+ set that was passed in
def parseFiles(users, dir_glob)
  puts "parsing '#{dir_glob}'"
  # Field name => XPath of the element holding that field's value.
  field_xpaths = {
    'name'            => '//div[@class = "views-field-title"]/span',
    'title'           => '//span[@class = "views-field-field-professional-title-value"]/span',
    'organisation'    => '//span[@class = "views-field-field-organization-value"]/span',
    'nationality'     => '//div[@class = "views-field-field-nationality-value"]/span',
    'primary_email'   => '//div[@class = "views-field-field-primary-email-email"]/span',
    'secondary_email' => '//div[@class = "views-field-field-secondary-email-email"]/span',
    'work_phone'      => '//div[@class = "views-field-field-work-phone-value"]/span',
    # Previously the mobile query result was stored in the wrong variable,
    # so 'mobile_phone' silently repeated the work phone.
    'mobile_phone'    => '//div[@class = "views-field-field-mobile-phone-value"]/span'
  }
  Dir.glob(dir_glob) do |file|
    # The block form closes the file handle once the document is parsed.
    doc = File.open(file) { |f| Nokogiri::HTML(f) }
    user = {}
    field_xpaths.each do |field, xpath|
      user[field] = doc.xpath(xpath).children.to_s
    end
    users.add user
  end
  users
end
# Parse every saved profile page and write one row per user to
# 'weforum_contact_database.csv'. No header row is written; columns follow
# the key order of each user hash.
users = parseFiles(Set.new, "./private.site.com/users/*")
File.open('weforum_contact_database.csv', 'w') do |f|
  users.each do |u|
    u.each_value do |value|
      # Double any embedded quote so a value containing '"' cannot end the
      # quoted field early and corrupt the row.
      quoted = '"' + value.gsub('"', '""') + '"'
      # Every field, including the last, is followed by a comma; this keeps
      # the row layout identical to what the script produced before.
      f.print(quoted + ",")
    end
    f.print "\n"
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment