Skip to content

Instantly share code, notes, and snippets.

@congzhangzh
Forked from rjshade/grab_all_pages.sh
Created February 12, 2017 11:59
Show Gist options
  • Save congzhangzh/c8c07a7cc3a8f28999ebb82c49e59070 to your computer and use it in GitHub Desktop.
Save congzhangzh/c8c07a7cc3a8f28999ebb82c49e59070 to your computer and use it in GitHub Desktop.
wget with HTTPS and cookie-based login
#!/bin/bash
# Log in once and store the session cookie so later requests are authenticated.
# Output is discarded; we only need the saved cookie jar.
wget --post-data='name=USERNAME&pass=PASSWORD&op=Log%20in' --save-cookies=my-cookies.txt --keep-session-cookies "https://private.site.com" > /dev/null 2>&1

# Scrape every listing page (353 pages), replaying the stored login cookie.
# NOTE: the obsolete --cookies=on flag is dropped — modern GNU wget does not
# recognize it, and cookies are enabled by default anyway.
for i in {0..353}
do
  echo "grabbing page $i..."
  wget --keep-session-cookies --load-cookies=my-cookies.txt "https://private.site.com/people?page=$i" > /dev/null 2>&1
  # Sleep 1-3 seconds so as not to hammer the server, and to keep up the
  # pretence that we are human. $(( )) replaces the deprecated $[ ] form.
  sleep $(( (RANDOM % 3) + 1 ))s
done
#!/bin/bash
# Log in once and store the session cookie (output discarded).
wget --post-data='name=USERNAME&pass=PASSWORD&op=Log%20in' --save-cookies=my-cookies.txt --keep-session-cookies "https://private.site.com" > /dev/null 2>&1

echo "grabbing all people from user list..."
# Fetch every URL listed in ./user_list: convert links for local viewing (-k),
# pull page requisites (-p), and reject stylesheets/scripts (-R).
# The reject patterns are quoted instead of backslash-escaped, and the
# obsolete --cookies=on flag is dropped (unrecognized by modern GNU wget;
# cookies are on by default).
wget -k -p -R '*css,*js' -i "user_list" --wait=1 --random-wait --keep-session-cookies --load-cookies=my-cookies.txt
require 'nokogiri'
require 'set'
# Parse every saved listing page matched by dir_glob and collect the
# profile hrefs found on each page into the given set.
#
# users    - Set of href strings accumulated so far (mutated in place).
# dir_glob - Dir.glob pattern selecting the saved HTML listing pages.
#
# Returns the (same) Set of unique profile hrefs.
def parseFiles(users, dir_glob)
  puts "parsing '#{dir_glob}'"
  dupes = 0
  Dir.glob(dir_glob) do |file|
    # File.read avoids leaking the handle that Kernel#open left unclosed.
    doc = Nokogiri::HTML(File.read(file))
    # All 'a' tags whose parent div carries the profile-card class.
    links = doc.xpath('//div[@class = "profile_card_popup_actions_profile"]/a')
    # Each listing page is expected to carry exactly 50 profile links.
    if links.size != 50
      puts "Links size strange: ", links.size
    end
    links.each do |a|
      value = a.attributes['href'].value
      # Flag suspiciously short hrefs for manual inspection.
      puts value if value.length < 10
      # Plain membership test replaces the original one-element-Set
      # subset? roundabout.
      dupes += 1 if users.include?(value)
      users.add value
    end
  end
  puts users.size
  puts "total dupes = #{dupes}"
  users
end
# Collect every unique profile href from the saved listing pages, then
# write one fully-qualified profile URL per line to ./user_list.
users = parseFiles(Set.new, "./people*")
puts users.size
File.open('user_list', 'w') do |f|
  users.each { |u| f.puts("https://private.site.com/#{u}") }
end
require 'csv'
require 'nokogiri'
require 'set'
# Extract contact details from each saved profile page matched by dir_glob
# and add one Hash of fields per user to the given set.
#
# users    - Set of user-detail Hashes accumulated so far (mutated in place).
# dir_glob - Dir.glob pattern selecting the saved profile pages.
#
# Returns the (same) Set of user Hashes.
def parseFiles(users, dir_glob)
  puts "parsing '#{dir_glob}'"

  # Each field lives in a span under an element with a distinctive Drupal
  # views-field class. Insertion order matters: the CSV writer below emits
  # columns in this order.
  fields = {
    'name'            => '//div[@class = "views-field-title"]/span',
    'title'           => '//span[@class = "views-field-field-professional-title-value"]/span',
    'organisation'    => '//span[@class = "views-field-field-organization-value"]/span',
    'nationality'     => '//div[@class = "views-field-field-nationality-value"]/span',
    'primary_email'   => '//div[@class = "views-field-field-primary-email-email"]/span',
    'secondary_email' => '//div[@class = "views-field-field-secondary-email-email"]/span',
    'work_phone'      => '//div[@class = "views-field-field-work-phone-value"]/span',
    # BUG FIX: the original queried the mobile-phone node but then stored the
    # work-phone value again; the mobile number is now stored correctly.
    'mobile_phone'    => '//div[@class = "views-field-field-mobile-phone-value"]/span'
  }

  Dir.glob(dir_glob) do |file|
    # File.read avoids leaking the handle that Kernel#open left unclosed.
    doc = Nokogiri::HTML(File.read(file))
    user = {}
    fields.each do |key, xpath|
      user[key] = doc.xpath(xpath).children.to_s
    end
    users.add user
  end
  users
end
# Gather all scraped users and write them out as CSV.
users = parseFiles(Set.new, "./private.site.com/users/*")

# Use the stdlib CSV library: the original hand-rolled '"' + value + '"'
# concatenation broke on values containing double quotes and emitted a
# trailing comma (a spurious empty column) on every row.
CSV.open('weforum_contact_database.csv', 'w', force_quotes: true) do |csv|
  users.each do |u|
    # Hash insertion order is stable in Ruby, so column order matches the
    # field order used when the user Hash was built.
    csv << u.values
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment