Skip to content

Instantly share code, notes, and snippets.

@tkizm1
Last active September 7, 2022 07:06
Show Gist options
  • Save tkizm1/c797253887fc3535a2af2c51a249aced to your computer and use it in GitHub Desktop.
Save tkizm1/c797253887fc3535a2af2c51a249aced to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# 作者:蓝色
# 链接:https://www.zhihu.com/question/37787176/answer/81607754
# 来源:知乎
# 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
#! /usr/bin/env python
import json
import os
import re
import sys
import urllib2
from os.path import basename
from urlparse import urlsplit

import requests
# Question page to scrape; also sent as the Referer header on API calls.
url = 'https://www.zhihu.com/question/37787176'
# Number of answers requested per API call.
page_size = 50
# Seconds to wait on any single network call before giving up.
REQUEST_TIMEOUT = 30

_API_URL = "https://www.zhihu.com/api/v4/questions/37787176/answers"
_IMAGE_DIR = 'images'
_ANSWER_COUNT_RE = re.compile(r'meta itemprop="answerCount" content="(.*?)"')
_IMG_URL_RE = re.compile(r'img .*?src="(?=http)(.*?_hd.*?)"')
_INCLUDE = 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics'


def parse_answer_count(page_html):
    """Return the answer count found in the question page HTML.

    Returns None when the ``answerCount`` meta tag is absent or its content
    is not an integer, instead of failing with IndexError/ValueError.
    """
    match = _ANSWER_COUNT_RE.search(page_html)
    if match is None:
        return None
    try:
        return int(match.group(1))
    except ValueError:
        return None


def extract_image_urls(content):
    """Return the absolute ``_hd`` image URLs found in an answer's HTML.

    A missing or empty ``content`` yields an empty list.
    """
    if not content:
        return []
    return _IMG_URL_RE.findall(content)


def fetch_answer_page(offset):
    """Request one page of answers starting at ``offset``.

    Returns the list stored under the ``data`` key of the JSON response, or
    None when the response is not JSON or carries no ``data`` key.
    """
    params = {
        'sort_by': 'default',
        'limit': page_size,
        'offset': offset,
        'include': _INCLUDE,
    }
    header = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': url,
        # Copy the authorization value from the browser dev tools:
        # Network -> XHR -> request headers.
        # NOTE(review): presumably the API rejects requests while this is
        # empty -- confirm against a live response.
        'authorization': '',
    }
    response = requests.get(_API_URL, params=params, headers=header,
                            timeout=REQUEST_TIMEOUT)
    try:
        payload = response.json()
    except ValueError:
        return None
    if not isinstance(payload, dict):
        return None
    return payload.get("data")


def save_image(img_url, directory=_IMAGE_DIR):
    """Download ``img_url`` into ``directory``; return True on success.

    Downloads are best effort: a failure is reported on stderr and the image
    is skipped so that one bad URL does not abort the whole run.
    """
    try:
        file_name = basename(urlsplit(img_url)[2])
        if not file_name:
            # The URL path ends in "/", so there is no name to save under.
            return False
        img_data = urllib2.urlopen(img_url, timeout=REQUEST_TIMEOUT).read()
        with open(os.path.join(directory, file_name), 'wb') as output:
            output.write(img_data)
    except Exception as exc:
        # Unlike a bare ``except``, this lets KeyboardInterrupt through.
        sys.stderr.write("skipped %s: %s\n" % (img_url, exc))
        return False
    return True


def main():
    """Download every ``_hd`` image from all answers of the question.

    Returns 0 on success and 1 when the answer count or an answer page
    could not be read.
    """
    if not os.path.exists(_IMAGE_DIR):
        os.mkdir(_IMAGE_DIR)

    page_html = urllib2.urlopen(url, timeout=REQUEST_TIMEOUT).read()
    limits = parse_answer_count(page_html)
    if limits is None:
        sys.stderr.write("could not find the answer count in %s\n" % url)
        return 1

    offset = 0
    while offset < limits:
        answer_list = fetch_answer_page(offset)
        if answer_list is None:
            sys.stderr.write(
                "no answer data returned at offset %d; "
                "check the authorization header\n" % offset)
            return 1
        for answer in answer_list:
            for img_url in extract_image_urls(answer.get('content')):
                save_image(img_url)
        offset += page_size
    return 0


if __name__ == "__main__":
    sys.exit(main())
@tkizm1
Copy link
Author

tkizm1 commented Nov 2, 2017

因为需要token
image

@Playuyl
Copy link

Playuyl commented Sep 7, 2022

爬出来是IndexError: list index out of range怎么解决呀

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment