Last active
September 7, 2022 07:06
-
-
Save tkizm1/c797253887fc3535a2af2c51a249aced to your computer and use it in GitHub Desktop.
zhihu.py from https://www.zhihu.com/question/37787176
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# 作者:蓝色
# 链接:https://www.zhihu.com/question/37787176/answer/81607754
# 来源:知乎
# 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
#! /usr/bin/env python
import json
import os
import re
import sys
import urllib2
from os.path import basename
from urlparse import urlsplit

import requests
# Question page whose answers are scanned for images.
url = 'https://www.zhihu.com/question/37787176'
# Number of answers requested per API call.
page_size = 50

# JSON endpoint that returns the answers of the question above, page by page.
ANSWERS_API_URL = "https://www.zhihu.com/api/v4/questions/37787176/answers"
# Directory (relative to the working directory) that receives the images.
IMAGE_DIR = 'images'
# Copy the "authorization" value from the browser:
# network -> xhr -> request headers.
AUTHORIZATION = ''
# Value of the "include" query parameter sent with every API request.
ANSWER_INCLUDE = 'data[*].is_normal,admin_closed_comment,reward_info,is_collapsed,annotation_action,annotation_detail,collapse_reason,is_sticky,collapsed_by,suggest_edit,comment_count,can_comment,content,editable_content,voteup_count,reshipment_settings,comment_permission,created_time,updated_time,review_info,question,excerpt,relationship.is_authorized,is_author,voting,is_thanked,is_nothelp,upvoted_followees;data[*].mark_infos[*].url;data[*].author.follower_count,badge[?(type=best_answerer)].topics'

# Compiled once instead of on every loop iteration.
ANSWER_COUNT_RE = re.compile(r'meta itemprop="answerCount" content="(.*?)"')
HD_IMAGE_RE = re.compile(r'img .*?src="(?=http)(.*?_hd.*?)"')


def parse_answer_count(html):
    """Return the answer count found in the question page, or None.

    None is returned when the answerCount meta tag is missing or its
    content is not an integer, so the caller can report the problem
    instead of failing with an IndexError.
    """
    found = ANSWER_COUNT_RE.search(html)
    if found is None:
        return None
    try:
        return int(found.group(1))
    except ValueError:
        return None


def extract_image_urls(content):
    """Return the http(s) image URLs containing "_hd" in an answer body.

    A missing or empty body yields an empty list.
    """
    return HD_IMAGE_RE.findall(content or '')


def fetch_answers(offset):
    """Fetch one page of answers starting at *offset* and return the list.

    Exits with an explanatory message when the response is not JSON or has
    no "data" key, rather than raising a bare KeyError.
    """
    params = {
        'sort_by': 'default',
        'limit': page_size,
        'offset': offset,
        'include': ANSWER_INCLUDE,
    }
    headers = {
        'User-Agent': "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:34.0) Gecko/20100101 Firefox/34.0",
        'Host': "www.zhihu.com",
        'Referer': url,
        'authorization': AUTHORIZATION,
    }
    response = requests.get(ANSWERS_API_URL, params=params, headers=headers)
    try:
        payload = response.json()
    except ValueError:
        payload = None
    if not isinstance(payload, dict) or "data" not in payload:
        raise SystemExit(
            "Unexpected API response (HTTP %s): %r; "
            "check the 'authorization' header value."
            % (response.status_code, payload)
        )
    return payload["data"]


def download_image(img_url):
    """Download *img_url* into IMAGE_DIR, named after the URL path basename.

    Downloading is best effort: a failure is reported on stderr and the
    image is skipped. KeyboardInterrupt is deliberately not caught, so the
    run can still be aborted with Ctrl-C.
    """
    try:
        img_data = urllib2.urlopen(img_url).read()
        file_name = basename(urlsplit(img_url)[2])
        # The context manager closes the file even if the write fails.
        with open(os.path.join(IMAGE_DIR, file_name), 'wb') as output:
            output.write(img_data)
    except Exception as exc:
        sys.stderr.write("skipped %s: %s\n" % (img_url, exc))


def main():
    """Download every "_hd" image found in the answers of the question."""
    if not os.path.exists(IMAGE_DIR):
        os.mkdir(IMAGE_DIR)

    url_content = urllib2.urlopen(url).read()
    limits = parse_answer_count(url_content)
    if limits is None:
        raise SystemExit(
            "Could not find the answerCount meta tag in %s; "
            "the page layout may have changed or the request was blocked."
            % url
        )

    offset = 0
    while offset < limits:
        answer_list = fetch_answers(offset)
        if not answer_list:
            # An empty page means there is nothing further to fetch.
            break
        for answer in answer_list:
            for img_url in extract_image_urls(answer.get('content')):
                download_image(img_url)
        offset += page_size


if __name__ == "__main__":
    main()
爬出来是IndexError: list index out of range怎么解决呀
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
请问为什么这个程序我跑不了呢?你的post_url我登录上去显示的是错误界面?请问一下是怎么回事呀?answer_list = response.json()["data"]这一行报错KeyError: 'data'