Python web crawling (5)

Web crawling examples with Python, using urllib, BeautifulSoup, re, and requests.

Tmall comments

Reposted from http://blog.csdn.net/flysky1991/article/details/74586286

import requests
import json

# JSON data for the product comments
url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=541396117031&spuId=128573071&sellerId=2616970884&order=3&currentPage=1&append=0&content=1'
req = requests.get(url)
# Strip the JSONP prefix so only the JSON body remains
jsondata = req.text[15:]
data = json.loads(jsondata)
# Print the page information
print('page:', data['paginator']['page'])
# Iterate over the comment list
for i in data["rateList"]:
    # Print the product SKU
    print(i['auctionSku'])
    # Print the comment time and content
    print(i['rateDate'], i['rateContent'])
    info = i['appendComment']
    # Check whether there is a follow-up comment
    if info:
        print(info['commentTime'])
        print(info['content'])
    print('======')
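
The hard-coded req.text[15:] slice depends on the exact length of the JSONP prefix Tmall happens to return. A more tolerant way to pull the JSON body out of the response is to slice between the first '{' and the last '}'; the helper below (extract_json is a hypothetical name, not part of the original script) is a minimal sketch of that idea, assuming the payload is a single top-level object.

import json
import requests

def extract_json(text):
    """Return the JSON object embedded in a JSONP-style response.

    Assumes the payload is one top-level {...} object; raises ValueError
    if no braces are found.
    """
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end == -1:
        raise ValueError('no JSON object found in response')
    return json.loads(text[start:end + 1])

# Usage (hypothetical): data = extract_json(requests.get(url).text)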

JD comments

Reposted from http://blog.csdn.net/flysky1991/article/details/75040253

# -*- coding: utf-8 -*-
import urllib.request
import json
import time
import random
import pymysql.cursors

def crawlProductComment(url, page):
    # Read the raw response (note: JD returns gbk-encoded data)
    html = urllib.request.urlopen(url).read().decode('gbk')
    # Extract the JSON payload from the raw data ('{' and '}' mark its start and end)
    jsondata = html[27:-2]
    # print(jsondata)
    data = json.loads(jsondata)
    # print(data['comments'])
    # print(data['comments'][0]['content'])
    # Iterate over the product comment list
    for i in data['comments']:
        productName = i['referenceName']
        commentTime = i['creationTime']
        content = i['content']
        # Print the key fields of the comment
        print("Product name: {}".format(productName))
        print("Comment time: {}".format(commentTime))
        print("Comment content: {}".format(content))
        print("-----------------------------")
        '''
        Database operations
        '''
        # Get a database connection
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='123456',
                                     db='jd',
                                     charset='utf8mb4')
        try:
            # Get a cursor
            with connection.cursor() as cursor:
                # Build the SQL statement (values are passed as parameters)
                sql = """insert into `jd-mi6` (`productName`,`commentTime`,`content`) values (%s,%s,%s)"""
                # Execute the SQL statement
                cursor.execute(sql, (productName, commentTime, content))
            # Commit to the database
            connection.commit()
        finally:
            connection.close()

for i in range(0, 10):
    print("Fetching page {} of comment data!".format(i + 1))
    # Xiaomi 6 comment URL; loop over pages by changing the page parameter
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv56668&productId=4099139&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&fold=1'
    crawlProductComment(url, i)
    # Sleep between requests
    time.sleep(random.randint(31, 33))
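
The script assumes a MySQL database named jd with a jd-mi6 table already in place; the original article does not show the schema. A minimal one-off setup sketch that matches the three columns inserted above could look like this (the column types and lengths are assumptions, adjust them to your data).

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS `jd-mi6` (
    `id`          INT AUTO_INCREMENT PRIMARY KEY,
    `productName` VARCHAR(255),
    `commentTime` VARCHAR(64),
    `content`     TEXT
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       db='jd', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(ddl)   # create the table the crawler writes into
    conn.commit()
finally:
    conn.close()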

QQ Music URLs

Reposted from http://www.cnblogs.com/dearvee/p/6602677.html

import requests
import urllib.request
import json

word = '雨蝶'
res1 = requests.get('https://c.y.qq.com/soso/fcgi-bin/client_search_cp?&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=' + word)
jm1 = json.loads(res1.text.strip('callback()[]'))
jm1 = jm1['data']['song']['list']
mids = []
songmids = []
srcs = []
songnames = []
singers = []
for j in jm1:
    try:
        mids.append(j['media_mid'])
        songmids.append(j['songmid'])
        songnames.append(j['songname'])
        singers.append(j['singer'][0]['name'])
    except Exception:
        print('wrong')
for n in range(0, len(mids)):
    res2 = requests.get('https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?&jsonpCallback=MusicJsonCallback&cid=205361747&songmid=' + songmids[n] + '&filename=C400' + mids[n] + '.m4a&guid=6612300644')
    jm2 = json.loads(res2.text)
    vkey = jm2['data']['items'][0]['vkey']
    srcs.append('http://dl.stream.qqmusic.qq.com/C400' + mids[n] + '.m4a?vkey=' + vkey + '&guid=6612300644&uin=0&fromtag=66')
print('For ' + word + ' Start download...')
x = len(srcs)
for m in range(0, x):
    print(str(m) + '***** ' + songnames[m] + ' - ' + singers[m] + '.m4a *****' + ' Downloading...')
    try:
        urllib.request.urlretrieve(srcs[m], 'd:/music/' + songnames[m] + ' - ' + singers[m] + '.m4a')
    except Exception:
        x = x - 1
        print('Download wrong~')
print('For [' + word + '] Download complete ' + str(x) + ' files !')
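
The download loop assumes that d:/music/ already exists and that song and artist names contain only characters Windows accepts in file names. A small defensive helper along these lines keeps urlretrieve from failing on either point; download_track and its character blacklist are assumptions for illustration, not part of the original script.

import os
import re
import urllib.request

def download_track(src, directory, title):
    """Download one track, creating the directory and sanitizing the file name."""
    os.makedirs(directory, exist_ok=True)
    # replace characters that are invalid in Windows file names (assumed rule)
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    urllib.request.urlretrieve(src, os.path.join(directory, safe_title + '.m4a'))

# Usage (hypothetical):
# download_track(srcs[m], 'd:/music', songnames[m] + ' - ' + singers[m])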

Sogou pictures

Reposted from http://www.cnblogs.com/dearvee/p/6558571.html

import requests
import json
import urllib.request

def getSogouImag(category, length, path):
    n = length
    cate = category
    imgs = requests.get('http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=' + cate + '&tag=%E5%85%A8%E9%83%A8&start=0&len=' + str(n))
    jd = json.loads(imgs.text)
    jd = jd['all_items']
    imgs_url = []
    for j in jd:
        imgs_url.append(j['bthumbUrl'])
    m = 0
    for img_url in imgs_url:
        print('***** ' + str(m) + '.jpg *****' + ' Downloading...')
        urllib.request.urlretrieve(img_url, path + str(m) + '.jpg')
        m = m + 1
    print('Download complete!')

getSogouImag('壁纸', 2000, 'd:/download/壁纸/')
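
Two practical caveats with the loop above: the target path must exist before urlretrieve runs, and not every entry in all_items is guaranteed to carry a bthumbUrl field. A slightly defensive variant of the inner loop is sketched below; download_items is a hypothetical helper and the "skip on missing key" behaviour is an assumption about the API, not something the original post states.

import os
import urllib.request

def download_items(items, path):
    """Download thumbnails from a list of Sogou result dicts,
    skipping entries that have no 'bthumbUrl' field."""
    os.makedirs(path, exist_ok=True)
    for m, item in enumerate(items):
        img_url = item.get('bthumbUrl')
        if not img_url:
            continue  # no thumbnail URL for this record
        print('***** ' + str(m) + '.jpg ***** Downloading...')
        urllib.request.urlretrieve(img_url, os.path.join(path, str(m) + '.jpg'))

# Usage (hypothetical): download_items(jd, 'd:/download/壁纸/')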

Taobao products

Reposted from http://blog.csdn.net/d1240673769/article/details/74620085

# Crawl Taobao products
import urllib.request
import pymysql
import re

# Open a page and return its content
def url_open(url):
    headers = ("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    return data

# Store data into MySQL
def data_Import(sql):
    conn = pymysql.connect(host='127.0.0.1', user='dengjp', password='123456', db='python', charset='utf8')
    conn.query(sql)
    conn.commit()
    conn.close()

if __name__ == '__main__':
    try:
        # Keyword to search for
        keywd = "短裙"
        keywords = urllib.request.quote(keywd)
        # Number of pages to crawl
        num = 100
        for i in range(num):
            url = "https://s.taobao.com/search?q=" + keywords + "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s=" + str(i * 44)
            data = url_open(url)
            # Regular expressions for each field
            img_pat = '"pic_url":"(//.*?)"'
            name_pat = '"raw_title":"(.*?)"'
            nick_pat = '"nick":"(.*?)"'
            price_pat = '"view_price":"(.*?)"'
            fee_pat = '"view_fee":"(.*?)"'
            sales_pat = '"view_sales":"(.*?)"'
            comment_pat = '"comment_count":"(.*?)"'
            city_pat = '"item_loc":"(.*?)"'
            # Find all matches and store them in lists
            imgL = re.compile(img_pat).findall(data)
            nameL = re.compile(name_pat).findall(data)
            nickL = re.compile(nick_pat).findall(data)
            priceL = re.compile(price_pat).findall(data)
            feeL = re.compile(fee_pat).findall(data)
            salesL = re.compile(sales_pat).findall(data)
            commentL = re.compile(comment_pat).findall(data)
            cityL = re.compile(city_pat).findall(data)
            for j in range(len(imgL)):
                img = "http:" + imgL[j]   # product image URL
                name = nameL[j]           # product name
                nick = nickL[j]           # shop name
                price = priceL[j]         # product price
                fee = feeL[j]             # shipping fee
                sales = salesL[j]         # number of buyers
                comment = commentL[j]     # number of comments (may be empty)
                if comment == "":
                    comment = 0
                city = cityL[j]           # shop city
                print('Crawling page ' + str(i) + ', item ' + str(j) + '...')
                sql = "insert into taobao(name,price,fee,sales,comment,city,nick,img) values('%s','%s','%s','%s','%s','%s','%s','%s')" % (name, price, fee, sales, comment, city, nick, img)
                data_Import(sql)
        print("Crawling finished; data has been stored in the database")
    except Exception as e:
        print(str(e))
    print("Task finished")

Weibo pictures

Reposted from https://github.com/darrenfantasy/image_crawler/blob/master/SinaWeibo/weibo_crawler.py

# encoding:utf-8
from selenium import webdriver
import time
import requests
import json
from bs4 import BeautifulSoup
import os
import sys

request_params = {"ajwvr": "6", "domain": "100505", "domain_op": "100505", "feed_type": "0", "is_all": "1", "is_tag": "0", "is_search": "0"}
profile_request_params = {"profile_ftype": "1", "is_all": "1"}
weibo_url = "http://weibo.com/"
requset_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?"
cookie_save_file = "cookie.txt"  # file that stores the cookie
cookie_update_time_file = "cookie_timestamp.txt"  # file that stores the cookie timestamp
image_result_file = "image_result.md"  # file that stores the image results
# username = 'your weibo account'  # your Weibo account
# password = 'your weibo password'  # your Weibo password
person_site_name = "mrj168"  # custom domain of the account to crawl; if it has none, use u/ + weibo id, e.g. u/12345678
weibo_id = "1837498771"  # Weibo id; open the profile in a browser, view the page source and look for $CONFIG['oid']='1837498771';
page_size = 5  # number of pages to crawl
headers = {  # the User-Agent should be adjusted for your own machine
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'weibo.com',
    'Pragma': 'no-cache',
    'Referer': 'http://weibo.com/u/3278620272?profile_ftype=1&is_all=1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

def get_timestamp():  # get the current system timestamp
    try:
        tamp = time.time()
        timestamp = str(int(tamp)) + "000"
        print timestamp
        return timestamp
    except Exception, e:
        print e
    finally:
        pass

def login_weibo_get_cookies():  # log in and fetch the cookies
    time.sleep(2)
    driver.find_element_by_name("username").send_keys(username)  # type the username
    driver.find_element_by_name("password").send_keys(password)  # type the password
    driver.find_element_by_xpath("//a[@node-type='submitBtn']").click()  # click the login button
    cookies = driver.get_cookies()  # fetch the cookies
    print cookies
    cookie = ""
    # convert the returned cookie list into the cookie string Weibo expects
    for x in xrange(len(cookies)):
        value = cookies[x]['name'] + "=" + cookies[x]['value'] + ";"
        cookie = cookie + value
    print cookie
    return cookie

def save_cookie(cookie):  # save the cookie locally
    try:
        f = open(cookie_save_file, 'w')
        f.write(cookie)
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def get_cookie_from_txt():  # read the cookie from the local file
    f = open(cookie_save_file)
    cookie = f.read()
    print cookie
    return cookie

def save_cookie_update_timestamp(timestamp):  # save the cookie update timestamp locally
    try:
        f = open(cookie_update_time_file, 'w')
        f.write(timestamp)
        f.write('\n')
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def get_cookie_update_time_from_txt():  # read the time of the last cookie update
    try:
        f = open(cookie_update_time_file)
        lines = f.readlines()
        cookie_update_time = lines[0]
        print cookie_update_time
        return cookie_update_time
    except Exception, e:
        print e
    finally:
        pass

def write_image_urls(image_list):
    try:
        f = open(image_result_file, 'a+')
        for x in xrange(len(image_list)):
            image = image_list[x]
            show_image = "![](" + image + ")"
            f.write(show_image.encode("utf-8"))
            f.write('\n')
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def is_valid_cookie():  # check whether the cookie is still valid
    if os.path.isfile(cookie_update_time_file) == False:
        return False
    else:
        f = open(cookie_update_time_file)
        lines = f.readlines()
        if len(lines) == 0:
            return False
        else:
            last_time_stamp = get_cookie_update_time_from_txt()
            if long(get_timestamp()) - long(last_time_stamp) > 6 * 60 * 60 * 1000:
                return False
            else:
                return True

def get_object_weibo_by_weibo_id_and_cookie(weibo_id, person_site_name, cookie, pagebar, page):  # call the feed API with the Weibo id and cookie
    try:
        headers["Cookie"] = cookie
        headers['Referer'] = weibo_url + person_site_name + "?profile_ftype=1&is_all=1"
        request_params["__rnd"] = get_timestamp()
        request_params["page"] = page
        request_params["pre_page"] = page
        request_params["pagebar"] = pagebar
        request_params["id"] = "100505" + weibo_id
        request_params["script_uri"] = "/" + person_site_name
        request_params["pl_name"] = "Pl_Official_MyProfileFeed__22"
        request_params["profile_ftype"] = 1
        response = requests.get(requset_url, headers=headers, params=request_params)
        print response.url
        html = response.json()["data"]
        return html
    except Exception, e:
        print e
    finally:
        pass

def get_object_top_weibo_by_person_site_name_and_cookie(person_site_name, cookie, page):  # top posts of each page
    try:
        profile_url = weibo_url + person_site_name + "?"
        headers["Cookie"] = cookie
        profile_request_params["page"] = page
        response = requests.get(profile_url, headers=headers, params=profile_request_params)
        print response.url
        html = response.text
        soup = BeautifulSoup(html, "html.parser")
        script_list = soup.find_all("script")
        script_size = len(script_list)
        print "script_size:" + str(script_size)
        tag = 0
        for x in xrange(script_size):
            if "WB_feed WB_feed_v3 WB_feed_v4" in str(script_list[x]):
                tag = x
        print "tag:" + str(tag)
        # print script_list[script_size-1]
        html_start = str(script_list[tag]).find("<div")
        html_end = str(script_list[tag]).rfind("div>")
        # print str(script_list[tag])[html_start:html_end+4]
        return str(str(script_list[tag])[html_start:html_end + 4])
    except Exception, e:
        print e
    finally:
        pass

def get_img_urls_form_html(html):  # extract image URLs from the returned HTML string
    try:
        image_url_list = []
        result_html = html.replace("\\", "")
        soup = BeautifulSoup(result_html, "html.parser")
        div_list = soup.find_all("div", 'media_box')
        print "div_list:" + str(len(div_list))
        for x in xrange(len(div_list)):
            image_list = div_list[x].find_all("img")
            for y in xrange(len(image_list)):
                image_url = image_list[y].get("src").replace("\\", "")
                print image_url
                image_url_list.append(image_url.replace("\"", ""))
        return image_url_list
    except Exception, e:
        print e
    finally:
        pass

if len(sys.argv) == 6:
    username = sys.argv[1]
    password = sys.argv[2]
    person_site_name = sys.argv[3]
    weibo_id = sys.argv[4]
    page_size = int(sys.argv[5])
    print "Weibo account: " + username
    print "Weibo password: " + password
    print "Custom domain of the account to crawl (use u/ + weibo id if it has none): " + person_site_name
    print "Id of the account to crawl: " + weibo_id
    print "Pages to crawl: " + str(page_size)
else:
    print "Wrong arguments; please pass 5 parameters in this order: 1. Weibo account 2. Weibo password 3. custom domain of the account to crawl (u/ + weibo id if it has none) 4. id of the account to crawl 5. number of pages to crawl"
    sys.exit(0)
result = is_valid_cookie()
print result
if result == False:
    driver = webdriver.Chrome("/Users/darrenfantasy/Documents/study/python/image_crawler/SinaWeibo/chromedriver")  # launch Chrome
    driver.maximize_window()  # maximize the browser window
    driver.get(weibo_url)  # open the Weibo login page
    time.sleep(10)  # wait 10s so the login page has time to finish loading
    cookie = login_weibo_get_cookies()
    save_cookie(cookie)
    save_cookie_update_timestamp(get_timestamp())
else:
    cookie = get_cookie_from_txt()
for x in xrange(1, page_size + 1):
    profile_html = get_object_top_weibo_by_person_site_name_and_cookie(person_site_name, cookie, x)
    image_url_list = get_img_urls_form_html(profile_html)
    write_image_urls(image_url_list)
    for y in xrange(0, 2):  # the page loads two extra "scroll down for more" chunks per page
        print "pagebar:" + str(y)
        html = get_object_weibo_by_weibo_id_and_cookie(weibo_id, person_site_name, cookie, y, x)
        image_url_list = get_img_urls_form_html(html)
        write_image_urls(image_url_list)
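
Unlike the other examples above, this Weibo script is written for Python 2 (print statements, xrange, except Exception, e, long). Porting it to Python 3 is mostly mechanical; as one example, here is a sketch of get_timestamp() rewritten in Python 3 syntax, with the remaining substitutions noted in comments. This is an idiom-level sketch only, not a tested port of the whole crawler.

import time

def get_timestamp():
    """Return the current time as a millisecond timestamp string."""
    try:
        tamp = time.time()
        timestamp = str(int(tamp)) + "000"
        print(timestamp)        # print is a function in Python 3
        return timestamp
    except Exception as e:      # 'except Exception, e' is Python 2 only
        print(e)

# Elsewhere the changes are: xrange -> range, long -> int, and writing text
# (not UTF-8 encoded bytes) to files opened with encoding="utf-8".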