Python web crawling using Scrapy (2)

Web crawling examples with Python, using Scrapy.

taobaosearch

Reposted from http://blog.csdn.net/github_35160620/article/details/53880412

pipelines.py

class ThirddemoPipeline(object):
    def process_item(self, item, spider):
        title = item['title'][0]
        link = item['link']
        price = item['price'][0]
        comment = item['comment'][0]
        print('Product name:', title)
        print('Product link:', link)
        print('Regular price:', price)
        print('Number of comments:', comment)
        print('------------------------------\n')
        return item

settings.py

BOT_NAME = 'thirdDemo'
SPIDER_MODULES = ['thirdDemo.spiders']
NEWSPIDER_MODULE = 'thirdDemo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'thirdDemo.pipelines.ThirddemoPipeline': 300,
}
COOKIES_ENABLED = False
FEED_EXPORT_ENCODING = 'utf-8' #unicode to utf8

items.py

import scrapy


class ThirddemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    comment = scrapy.Field()

taobao.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from thirdDemo.items import ThirddemoItem
import urllib.request


class TaobaoSpider(scrapy.Spider):
    name = "taobao"
    allowed_domains = ["taobao.com"]
    start_urls = ['http://taobao.com/']

    def parse(self, response):
        key = '小吃'  # search keyword ("snacks")
        for i in range(0, 2):
            url = 'https://s.taobao.com/search?q=' + str(key) + '&s=' + str(44 * i)
            print(url)
            yield Request(url=url, callback=self.page)

    def page(self, response):
        body = response.body.decode('utf-8', 'ignore')
        pattam_id = '"nid":"(.*?)"'
        all_id = re.compile(pattam_id).findall(body)
        # print(all_id)
        # print(len(all_id))
        for i in range(0, len(all_id)):
            this_id = all_id[i]
            url = 'https://item.taobao.com/item.htm?id=' + str(this_id)
            yield Request(url=url, callback=self.next)

    def next(self, response):
        item = ThirddemoItem()
        # print(response.url)
        url = response.url
        # Determine whether the product page belongs to Tmall / Tmall supermarket or to Taobao.
        pattam_url = 'https://(.*?).com'
        subdomain = re.compile(pattam_url).findall(url)
        # print(subdomain)
        # Extract the product title.
        if subdomain[0] != 'item.taobao':  # not a Taobao subdomain, so this is a Tmall page
            title = response.xpath("//div[@class='tb-detail-hd']/h1/text()").extract()
        else:
            title = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()
        # print(title)
        item['title'] = title
        # print(item['title'])
        # Product page URL.
        item['link'] = url
        # Extract the regular price.
        if subdomain[0] != 'item.taobao':  # Tmall
            pattam_price = '"defaultItemPrice":"(.*?)"'
            price = re.compile(pattam_price).findall(response.body.decode('utf-8', 'ignore'))
        else:  # Taobao
            price = response.xpath("//em[@class = 'tb-rmb-num']/text()").extract()
        # print(price)
        item['price'] = price
        # Extract the product id (used to build the URL of the comment-count endpoint).
        if subdomain[0] != 'item.taobao':
            pattam_id = 'id=(.*?)&'
        else:
            # The id is the last query parameter here, so anchor the pattern with $.
            pattam_id = 'id=(.*?)$'
        this_id = re.compile(pattam_id).findall(url)[0]
        # print(this_id)
        # Build the URL of the endpoint that carries the comment count.
        comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(this_id)
        # This request never fails outright: even for a wrong id the endpoint still returns
        # a page, so the original author did not wrap it in try/except urllib.error.URLError.
        comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
        pattam_comment = '"rateTotal":(.*?),"'
        comment = re.compile(pattam_comment).findall(comment_data)
        # print(comment)
        item['comment'] = comment
        yield item
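To launch the spider without typing the scrapy command each time, a tiny runner script can call Scrapy's command-line entry point programmatically. This is only a convenience sketch; the file name run.py is an assumption, not part of the original post.

# run.py -- hypothetical runner placed in the project root (next to scrapy.cfg)
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl taobao"; scraped items are passed to
# ThirddemoPipeline, which prints them to the console.
execute(['scrapy', 'crawl', 'taobao'])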

Fetching proxy IPs

Reposted from http://blog.csdn.net/c406495762/article/details/72793480

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from selenium import webdriver
import subprocess as sp
from lxml import etree
import requests
import random
import re

"""
Fetch a list of proxy IPs.
Parameters:
    page - page number of the high-anonymity proxy list, defaults to the first page
Returns:
    proxys_list - list of proxies
Modify:
    2017-05-27
"""
def get_proxys(page=1):
    # A requests Session keeps cookies automatically, so we do not manage them ourselves.
    S = requests.Session()
    # Xici high-anonymity proxy list
    target_url = 'http://www.xicidaili.com/nn/%d' % page
    # Reasonably complete request headers
    target_headers = {'Upgrade-Insecure-Requests': '1',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Referer': 'http://www.xicidaili.com/nn/',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      }
    # GET request
    target_response = S.get(url=target_url, headers=target_headers)
    # Decode as UTF-8
    target_response.encoding = 'utf-8'
    # Page HTML
    target_html = target_response.text
    # Grab the table whose id is ip_list
    bf1_ip_list = BeautifulSoup(target_html, 'lxml')
    bf2_ip_list = BeautifulSoup(str(bf1_ip_list.find_all(id='ip_list')), 'lxml')
    ip_list_info = bf2_ip_list.table.contents
    # List that stores the proxies
    proxys_list = []
    # Extract each proxy entry
    for index in range(len(ip_list_info)):
        if index % 2 == 1 and index != 1:
            dom = etree.HTML(str(ip_list_info[index]))
            ip = dom.xpath('//td[2]')
            port = dom.xpath('//td[3]')
            protocol = dom.xpath('//td[6]')
            proxys_list.append(protocol[0].text.lower() + '#' + ip[0].text + '#' + port[0].text)
    # Return the proxy list
    return proxys_list

"""
Check whether a proxy IP is reachable.
Parameters:
    ip - proxy IP address
    lose_time - compiled regex matching the packet-loss count
    waste_time - compiled regex matching the average round-trip time
Returns:
    average_time - average round-trip time of the proxy IP
Modify:
    2017-05-27
"""
def check_ip(ip, lose_time, waste_time):
    # ping options (Windows): -n number of echo requests, -w timeout per reply in milliseconds
    cmd = "ping -n 3 -w 3 %s"
    # Run the command
    p = sp.Popen(cmd % ip, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    # Read and decode the output
    out = p.stdout.read().decode("gbk")
    # Packet-loss count
    lose_time = lose_time.findall(out)
    # If the loss count cannot be matched, assume all three requests were lost
    if len(lose_time) == 0:
        lose = 3
    else:
        lose = int(lose_time[0])
    # More than two lost packets counts as a timeout: return 1000 ms
    if lose > 2:
        return 1000
    # Otherwise read the average round-trip time
    else:
        average = waste_time.findall(out)
        # If the average time cannot be matched, treat it as a severe timeout and return 1000 ms
        if len(average) == 0:
            return 1000
        else:
            average_time = int(average[0])
            # Return the average round-trip time
            return average_time

"""
Initialize the regular expressions.
Returns:
    lose_time - regex matching the packet-loss count
    waste_time - regex matching the average round-trip time
Modify:
    2017-05-27
"""
def initpattern():
    # Match the packet-loss count ("丢失 = n" in the Chinese ping output)
    lose_time = re.compile(r"丢失 = (\d+)", re.IGNORECASE)
    # Match the average round-trip time ("平均 = nms")
    waste_time = re.compile(r"平均 = (\d+)ms", re.IGNORECASE)
    return lose_time, waste_time

if __name__ == '__main__':
    # Initialize the regular expressions
    lose_time, waste_time = initpattern()
    # Fetch the proxy list
    proxys_list = get_proxys(1)
    # If the average time is over 200 ms, pick another IP
    while True:
        # Randomly pick one proxy from the list
        proxy = random.choice(proxys_list)
        split_proxy = proxy.split('#')
        # The IP address
        ip = split_proxy[1]
        # Check the IP
        average_time = check_ip(ip, lose_time, waste_time)
        if average_time > 200:
            # Drop the unusable IP
            proxys_list.remove(proxy)
            print("IP timed out, picking another one!")
        if average_time < 200:
            break
    # Drop the IP that has just been chosen
    proxys_list.remove(proxy)
    proxy_dict = {split_proxy[0]: split_proxy[1] + ':' + split_proxy[2]}
    print("Using proxy:", proxy_dict)

zhihuuser

Reposted from http://cuiqingcai.com/4380.html

zhihu.py

# -*- coding: utf-8 -*-
import json

from scrapy import Spider, Request

from zhihuq.items import UserItem


class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    start_user = 'qing-lan-98'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
                      self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
                      self.parse_followers)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(
            self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
            self.parse_follows)
        yield Request(
            self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_follows)

    def parse_followers(self, response):
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_followers)

items.py

from scrapy import Item, Field


class UserItem(Item):
    # define the fields for your item here like:
    id = Field()
    name = Field()
    avatar_url = Field()
    headline = Field()
    description = Field()
    url = Field()
    url_token = Field()
    gender = Field()
    cover_url = Field()
    type = Field()
    badge = Field()
    answer_count = Field()
    articles_count = Field()
    commercial_question_count = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_count = Field()
    pins_count = Field()
    question_count = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    marked_answers_count = Field()
    mutual_followees_count = Field()
    hosted_live_count = Field()
    participated_live_count = Field()
    locations = Field()
    educations = Field()
    employments = Field()

pipelines.py

import pymongo


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so re-crawled users are updated rather than duplicated.
        self.db[self.collection_name].update({'url_token': item['url_token']}, {'$set': dict(item)}, True)
        return item

mysql

import pymysql


class mysqlPipeline(object):
    def process_item(self, item, spider):
        id = item['id']
        name = item['name']
        answer_count = item['answer_count']
        articles_count = item['articles_count']
        favorite_count = item['favorite_count']
        favorited_count = item['favorited_count']
        follower_count = item['follower_count']
        following_count = item['following_count']
        following_columns_count = item['following_columns_count']
        following_question_count = item['following_question_count']
        following_topic_count = item['following_topic_count']
        hosted_live_count = item['hosted_live_count']
        participated_live_count = item['participated_live_count']
        question_count = item['question_count']
        thanked_count = item['thanked_count']
        marked_answers_count = item['marked_answers_count']
        try:
            gender = item['gender']
        except:
            gender = "N"
        try:
            school = item['educations'][0]['school']['name']
        except:
            school = "N"
        try:
            major = item['educations'][0]['major']['name']
        except:
            major = "N"
        try:
            job = item['employments'][0]['job']['name']
        except:
            job = "N"
        try:
            company = item['employments'][0]['company']['name']
        except:
            company = "N"
        try:
            locations = item['locations'][0]['name']
        except:
            locations = "N"
        try:
            headline = item['headline']
        except:
            headline = "N"
        # Connect to the local zhihuuser database
        conn = pymysql.connect(
            host='localhost',   # local database
            user='root',        # your MySQL user name
            passwd='',          # your password
            db='zhihuuser',     # database name
            charset='utf8'      # character set
        )
        try:
            # Get a cursor
            cursor = conn.cursor()
            # SQL INSERT statement (string interpolation, as in the original post)
            sql = """INSERT INTO user(id,name,gender,school,major,job,company,locations,answer_count,articles_count,favorite_count,favorited_count,follower_count,following_count,following_columns_count,following_question_count,following_topic_count,hosted_live_count,participated_live_count,question_count,thanked_count,marked_answers_count,headline)
            VALUES ('%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s')""" % (id,name,gender,school,major,job,company,locations,answer_count,articles_count,favorite_count,favorited_count,follower_count,following_count,following_columns_count,following_question_count,following_topic_count,hosted_live_count,participated_live_count,question_count,thanked_count,marked_answers_count,headline)
            # Execute the statement
            cursor.execute(sql)
            # Commit the change
            conn.commit()
        except:
            conn.rollback()
        finally:
            # Close the connection
            conn.close()
        return item
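The INSERT above assumes a user table already exists in the zhihuuser database, but the original post does not show its schema. The snippet below is a minimal sketch of a table matching the columns the pipeline writes; every column type here is an assumption chosen for illustration.

# One-off setup script (hypothetical): create the `user` table the pipeline expects.
# All column types are assumptions; adjust lengths/types to your data.
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='', db='zhihuuser', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS user (
                id VARCHAR(64), name VARCHAR(255), gender VARCHAR(8),
                school VARCHAR(255), major VARCHAR(255), job VARCHAR(255),
                company VARCHAR(255), locations VARCHAR(255),
                answer_count INT, articles_count INT, favorite_count INT,
                favorited_count INT, follower_count INT, following_count INT,
                following_columns_count INT, following_question_count INT,
                following_topic_count INT, hosted_live_count INT,
                participated_live_count INT, question_count INT,
                thanked_count INT, marked_answers_count INT,
                headline VARCHAR(255)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()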

settings.py

FEED_EXPORT_ENCODING = 'utf-8'
DOWNLOAD_DELAY = 2
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
ITEM_PIPELINES = {
    'zhihuq.pipelines.mysqlPipeline': 300,
}
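These settings route items to mysqlPipeline only. If you prefer the MongoPipeline shown earlier, its from_crawler method reads MONGO_URI and MONGO_DATABASE from the settings, so something along these lines would be needed; the URI and database name below are placeholder assumptions.

# Only needed when switching to MongoPipeline; the values are placeholders.
ITEM_PIPELINES = {
    'zhihuq.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'zhihuuser'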

start

Run the spider with a persistent job directory so the crawl can be paused (Ctrl-C) and resumed later with the same command:

scrapy crawl zhihu -s JOBDIR=zant/001

delete duplicate rows in mysql (example)

1. create table new_table (select * from user group by name,age,nub having count(*)>1);
2. delete from user where (name,age,nub) in
   (select * from
     (select * from user group by name,age,nub having count(*)>1) as b);
3. insert into user select name,age,nub from new_table;
4. drop table new_table;
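If you prefer to run these steps from Python rather than the MySQL client, a small pymysql sketch like the one below would work; the connection parameters and the generic user(name, age, nub) table are assumptions taken from the example above.

# Hypothetical wrapper around the four deduplication statements above.
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='', db='test', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("create table new_table "
                       "(select * from user group by name,age,nub having count(*)>1)")
        cursor.execute("delete from user where (name,age,nub) in "
                       "(select * from (select * from user group by name,age,nub "
                       "having count(*)>1) as b)")
        cursor.execute("insert into user select name,age,nub from new_table")
        cursor.execute("drop table new_table")
    conn.commit()
finally:
    conn.close()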

zhihucontent

Reposted from http://cuiqingcai.com/4607.html

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-6-5
import os
import re
import json
import requests
import html2text
from parse_content import parse

"""
just for study and fun
Talk is cheap
show me your code
"""


class ZhiHu(object):
    def __init__(self):
        self.request_content = None

    def request(self, url, retry_times=10):
        header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
            'Host': 'www.zhihu.com'
        }
        times = 0
        while retry_times > 0:
            times += 1
            print('request %s, times: %d' % (url, times))
            try:
                self.request_content = requests.get(url, headers=header, timeout=10).content
            except Exception as e:
                print(e)
                retry_times -= 1
            else:
                return self.request_content

    def get_all_answer_content(self, question_id, flag=2):
        first_url_format = 'https://www.zhihu.com/api/v4/questions/{}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=3'
        first_url = first_url_format.format(question_id)
        response = self.request(first_url)
        if response:
            contents = json.loads(response)
            print(contents.get('paging').get('is_end'))
            while not contents.get('paging').get('is_end'):
                for content in contents.get('data'):
                    self.parse_content(content, flag)
                next_page_url = contents.get('paging').get('next').replace('http', 'https')
                contents = json.loads(self.request(next_page_url))
        else:
            raise ValueError('request failed, quit......')

    def get_single_answer_content(self, answer_url, flag=1):
        all_content = {}
        question_id, answer_id = re.findall(r'https://www.zhihu.com/question/(\d+)/answer/(\d+)', answer_url)[0]
        html_content = self.request(answer_url)
        if html_content:
            all_content['main_content'] = html_content
        else:
            raise ValueError('request failed, quit......')
        ajax_answer_url = 'https://www.zhihu.com/api/v4/answers/{}'.format(answer_id)
        ajax_content = self.request(ajax_answer_url)
        if ajax_content:
            all_content['ajax_content'] = json.loads(ajax_content)
        else:
            raise ValueError('request failed, quit......')
        self.parse_content(all_content, flag)

    def parse_content(self, content, flag=None):
        data = parse(content, flag)
        self.transform_to_markdown(data)

    def transform_to_markdown(self, data):
        content = data['content']
        author_name = data['author_name']
        answer_id = data['answer_id']
        question_id = data['question_id']
        question_title = data['question_title']
        vote_up_count = data['vote_up_count']
        create_time = data['create_time']
        file_name = u'%s--%s的回答[%d].md' % (question_title, author_name, answer_id)
        folder_name = u'%s' % question_title
        if not os.path.exists(os.path.join(os.getcwd(), folder_name)):
            os.mkdir(folder_name)
        os.chdir(folder_name)
        f = open(file_name, "w")
        f.write("-" * 40 + "\n")
        origin_url = 'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
        f.write("## Original answer URL: " + origin_url + "\n")
        f.write("### Question_Title: " + question_title + "\n")
        f.write("### Author_Name: " + author_name + "\n")
        f.write("### Answer_ID: %d" % answer_id + "\n")
        f.write("### Question_ID: %d" % question_id + "\n")
        f.write("### VoteCount: %s" % vote_up_count + "\n")
        f.write("### Create_Time: " + create_time + "\n")
        f.write("-" * 40 + "\n")
        text = html2text.html2text(content.decode('utf-8'))
        # Strip extra whitespace inside bold/italic markers
        r = re.findall(r'\*\*(.*?)\*\*', text, re.S)
        for i in r:
            if i != " ":
                text = text.replace(i, i.strip())
        r = re.findall(r'_(.*)_', text)
        for i in r:
            if i != " ":
                text = text.replace(i, i.strip())
        text = text.replace('_ _', '')
        # Put images on their own paragraph
        r = re.findall(r'!\[\]\((?:.*?)\)', text)
        for i in r:
            text = text.replace(i, i + "\n\n")
        f.write(text)
        f.close()


if __name__ == '__main__':
    zhihu = ZhiHu()
    url = 'https://www.zhihu.com/question/27069622/answer/214576023'
    zhihu.get_single_answer_content(url)

    # question_id = '27621722'
    # zhihu.get_all_answer_content(question_id)

parse_content.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-6-5
import time
from bs4 import BeautifulSoup


def html_template(data):
    # wrap the API content in a minimal HTML document
    html = '''
            <html>
            <head>
            <body>
            %s
            </body>
            </head>
            </html>
            ''' % data
    return html


def parse(content, flag=None):
    data = {}
    if flag == 1:
        # single answer
        main_content = content.get('main_content')
        ajax_content = content.get('ajax_content')
        soup = BeautifulSoup(main_content.decode("utf-8"), "lxml")
        answer = soup.find("span", class_="RichText CopyrightRichText-richText")
        author_name = ajax_content.get('author').get('name')
        answer_id = ajax_content.get('id')
        question_id = ajax_content.get('question').get('id')
        question_title = ajax_content.get('question').get('title')
        vote_up_count = soup.find("meta", itemprop="upvoteCount")["content"]
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ajax_content.get('created_time')))
    else:
        # all answers of a question
        answer_content = content.get('content')
        author_name = content.get('author').get('name')
        answer_id = content.get('id')
        question_id = content.get('question').get('id')
        question_title = content.get('question').get('title')
        vote_up_count = content.get('voteup_count')
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content.get('created_time')))
        content = html_template(answer_content)
        soup = BeautifulSoup(content, 'lxml')
        answer = soup.find("body")

    print(author_name, answer_id, question_id, question_title, vote_up_count, create_time)
    # The DOM rewriting below is adapted from someone else's code, as the original author notes.
    soup.body.extract()
    soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
    soup.body.append(answer)
    img_list = soup.find_all("img", class_="content_image lazy")
    for img in img_list:
        img["src"] = img["data-actualsrc"]
    img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
    for img in img_list:
        img["src"] = img["data-actualsrc"]
    noscript_list = soup.find_all("noscript")
    for noscript in noscript_list:
        noscript.extract()
    data['content'] = soup
    data['author_name'] = author_name
    data['answer_id'] = answer_id
    data['question_id'] = question_id
    data['question_title'] = question_title
    data['vote_up_count'] = vote_up_count
    data['create_time'] = create_time
    return data

taobaocomment

Using Python 3.6. Reposted from http://www.cnblogs.com/dearvee/p/6565688.html

import requests
import json


def getCommodityComments(url):
    # Extract the numeric item id from the product URL (handles 11- and 12-digit ids).
    if url[url.find('id=') + 14] != '&':
        id = url[url.find('id=') + 3:url.find('id=') + 15]
    else:
        id = url[url.find('id=') + 3:url.find('id=') + 14]
    url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId=' + id + '&currentPageNum=1'
    res = requests.get(url)
    jc = json.loads(res.text.strip().strip('()'))
    max = jc['total']
    users = []
    comments = []
    count = 0
    page = 1
    print('This item has ' + str(max) + ' comments in total: loading...')
    while count < max:
        res = requests.get(url[:-1] + str(page))
        page = page + 1
        jc = json.loads(res.text.strip().strip('()'))
        jc = jc['comments']
        for j in jc:
            users.append(j['user']['nick'])
            comments.append(j['content'])
            print(count + 1, '>>', users[count], '\n  ', comments[count])
            count = count + 1


getCommodityComments('https://item.taobao.com/item.htm?id=39595400262&')
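The function above only prints each comment. As a small variant built on the same feedRateList.htm endpoint, the sketch below collects the (user, comment) pairs and writes them to a JSON file; the function name, page limit, and output file name are assumptions.

# Hypothetical variant of the crawler above: collect comments and save them as JSON.
import json
import requests


def fetch_comments(item_id, max_pages=3):
    base = 'https://rate.taobao.com/feedRateList.htm?auctionNumId=%s&currentPageNum=%d'
    collected = []
    for page in range(1, max_pages + 1):
        res = requests.get(base % (item_id, page))
        data = json.loads(res.text.strip().strip('()'))
        for c in data.get('comments') or []:
            collected.append({'user': c['user']['nick'], 'content': c['content']})
    return collected


if __name__ == '__main__':
    comments = fetch_comments('39595400262')
    with open('taobao_comments.json', 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)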