Python web crawling (3)

Web crawling examples in Python using urllib, re, and Selenium with PhantomJS; requests and BeautifulSoup are shown briefly below.
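
A minimal requests + BeautifulSoup sketch for comparison with the urllib-based examples that follow (the URL here is just a placeholder, and both packages need to be installed with pip):

import requests
from bs4 import BeautifulSoup

# Placeholder URL; swap in the page you actually want to scrape.
url = 'http://example.com'
headers = {'User-Agent': 'Mozilla/5.0'}

# requests handles the HTTP request; BeautifulSoup parses the returned HTML.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# As a simple demonstration, print every link and its text.
for a in soup.find_all('a', href=True):
    print(a.get_text(strip=True), a['href'])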

Python scraping: downloading Taobao photos (Python 3.6)

(Reposted)

import urllib.request
import re, os, datetime
from selenium import webdriver

class Spider:
    def __init__(self):
        self.page = 1
        self.dirName = 'MMSpider'
        # PhantomJS settings: turning loadImages off speeds things up,
        # but then the images on the detail pages can no longer be fetched,
        # so it is left on (the default).
        cap = webdriver.DesiredCapabilities.PHANTOMJS
        cap["phantomjs.page.settings.resourceTimeout"] = 1000
        #cap["phantomjs.page.settings.loadImages"] = False
        #cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True
        self.driver = webdriver.PhantomJS(desired_capabilities=cap)

    def getContent(self, maxPage):
        for index in range(1, maxPage + 1):
            self.LoadPageContent(index)

    # Fetch one list page and extract its entries
    def LoadPageContent(self, page):
        # record the start time
        begin_time = datetime.datetime.now()
        url = "https://mm.taobao.com/json/request_top_list.htm?page=" + str(page)
        self.page += 1
        USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36'
        headers = {'User-Agent': USER_AGENT}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        # extract the fields with a regular expression
        pattern_link = re.compile(r'<div.*?class="pic-word">.*?<img src="(.*?)".*?'
                                  r'<a.*?class="lady-name".*?href="(.*?)".*?>(.*?)</a>.*?'
                                  r'<em>.*?<strong>(.*?)</strong>.*?'
                                  r'<span>(.*?)</span>', re.S)
        items = re.findall(pattern_link, response.read().decode('gbk'))
        for item in items:
            # avatar, profile URL, name, age, location
            print(u'Found a model named %s, age %s, location %s' % (item[2], item[3], item[4]))
            print(u"%s's profile page is %s" % (item[2], item[1]))
            print(u'Fetching the detail page...')
            # detail page
            detailPage = item[1]
            name = item[2]
            self.getDetailPage(detailPage, name, begin_time)

    def getDetailPage(self, url, name, begin_time):
        url = 'http:' + url
        self.driver.get(url)
        base_msg = self.driver.find_elements_by_xpath('//div[@class="mm-p-info mm-p-base-info"]/ul/li')
        brief = ''
        for item in base_msg:
            print(item.text)
            brief += item.text + '\n'
        # profile info and avatar
        icon_url = self.driver.find_element_by_xpath('//div[@class="mm-p-model-info-left-top"]//img')
        icon_url = icon_url.get_attribute('src')
        dir = self.dirName + '/' + name
        self.mkdir(dir)
        # save the avatar
        try:
            self.saveIcon(icon_url, dir, name)
        except Exception as e:
            print(u'Failed to save the avatar: %s' % e)
        # jump to the album list
        images_url = self.driver.find_element_by_xpath('//ul[@class="mm-p-menu"]//a')
        images_url = images_url.get_attribute('href')
        try:
            self.getAllImage(images_url, name)
        except Exception as e:
            print(u'Exception while fetching the albums: %s' % e)
        end_time = datetime.datetime.now()
        # save the profile info together with the elapsed time
        try:
            self.saveBrief(brief, dir, name, end_time - begin_time)
        except Exception as e:
            print(u'Failed to save the profile info: %s' % e)

    # fetch all images
    def getAllImage(self, images_url, name):
        self.driver.get(images_url)
        # only the first album is fetched
        photos = self.driver.find_element_by_xpath('//div[@class="mm-photo-cell-middle"]//h4/a')
        photos_url = photos.get_attribute('href')
        # open the album page and collect its images
        self.driver.get(photos_url)
        images_all = self.driver.find_elements_by_xpath('//div[@id="mm-photoimg-area"]/a/img')
        self.saveImgs(images_all, name)

    def saveImgs(self, images, name):
        index = 1
        print(u"%s's album has %s photos, trying to download them all..." % (name, len(images)))
        for imageUrl in images:
            splitPath = imageUrl.get_attribute('src').split('.')
            fTail = splitPath.pop()
            if len(fTail) > 3:
                fTail = "jpg"
            fileName = self.dirName + '/' + name + '/' + name + str(index) + "." + fTail
            print(u'Saving photo to %s' % fileName)
            self.saveImg(imageUrl.get_attribute('src'), fileName)
            index += 1

    def saveIcon(self, url, dir, name):
        print(u'Avatar URL %s for %s' % (url, name))
        splitPath = url.split('.')
        fTail = splitPath.pop()
        fileName = dir + '/' + name + '.' + fTail
        print(fileName)
        self.saveImg(url, fileName)

    # write an image to disk
    def saveImg(self, imageUrl, fileName):
        print(imageUrl)
        u = urllib.request.urlopen(imageUrl)
        data = u.read()
        f = open(fileName, 'wb')
        f.write(data)
        f.close()

    # save the profile info
    def saveBrief(self, content, dir, name, speed_time):
        speed_time = u'Time spent on this profile: ' + str(speed_time)
        content = content + '\n' + speed_time
        fileName = dir + '/' + name + '.txt'
        f = open(fileName, 'wb+')
        print(u"Saving %s's profile info to %s" % (name, fileName))
        f.write(content.encode('utf-8'))
        f.close()

    # create a directory
    def mkdir(self, path):
        path = path.strip()
        print(u'Creating directory %s' % path)
        if os.path.exists(path):
            return False
        else:
            os.makedirs(path)
            return True

spider = Spider()
# crawl the first 5 list pages
spider.getContent(5)

Using Selenium and PhantomJS

from selenium import webdriver

# Initialize the browser. On Windows the path to phantomjs.exe must be given;
# on Linux it can be left empty if phantomjs is on PATH.
browser = webdriver.PhantomJS('D:\\phantomjs.exe')
url = 'http://www.zhidaow.com'                   # the page to visit
browser.get(url)                                 # open the page
title = browser.find_elements_by_xpath('//h2')   # locate elements via XPath
for t in title:                                  # iterate over the matches
    print(t.text)                                # print the element text
    b = t.get_attribute('class')
    print("text is %s" % b)                      # print the class attribute

Toutiao.com images

# coding=utf-8
import json
import os
import re
import urllib.parse
from urllib import request

'''
Python 3.x dynamic-page crawling (reverse-engineering the AJAX API) example.
Crawls the large images on every detail page returned by a Toutiao keyword search
and stores them grouped by keyword and article title.
'''

class CrawlOptAnalysis(object):
    def __init__(self, search_word="美女"):
        self.search_word = search_word
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Host': 'www.toutiao.com',
            'Referer': 'http://www.toutiao.com/search/?keyword={0}'.format(urllib.parse.quote(self.search_word)),
            'Accept': 'application/json, text/javascript',
        }

    def _crawl_data(self, offset):
        '''
        Simulate the pull-to-load-more request: fetch one batch of items for the given offset.
        '''
        url = 'http://www.toutiao.com/search_content/?offset={0}&format=json&keyword={1}&autoload=true&count=20&cur_tab=1'.format(offset, urllib.parse.quote(self.search_word))
        print(url)
        try:
            req = request.Request(url, headers=self.headers)
            with request.urlopen(req, timeout=10) as response:
                content = response.read()
        except Exception as e:
            content = None
            print('crawl data exception.' + str(e))
        return content

    def _parse_data(self, content):
        '''
        Parse one batch of items and collect every large-image URL from each item's
        detail data. The result has the form:
        [
            {'article_title': XXX, 'article_image_detail': ['url1', 'url2', 'url3']},
            {'article_title': XXX, 'article_image_detail': ['url1', 'url2', 'url3']}
        ]
        '''
        if content is None:
            return None
        result_list = list()
        try:
            data_list = json.loads(content)['data']
            print(data_list)
            for item in data_list:
                result_dict = {'article_title': item['title']}
                url_list = list()
                for url in item['image_detail']:
                    url_list.append(url['url'])
                result_dict['article_image_detail'] = url_list
                result_list.append(result_dict)
        except Exception as e:
            print('parse data exception.' + str(e))
        return result_list

    def _save_picture(self, page_title, url):
        '''
        Download one of the crawled large images.
        The download directory is ./output/search_word/page_title/image_file
        '''
        if url is None or page_title is None:
            print('save picture params is None!')
            return
        reg_str = r"[\/\\\:\*\?\"\<\>\|]"  # strip characters Windows does not allow in file names: '/\:*?"<>|'
        page_title = re.sub(reg_str, "", page_title)
        save_dir = './output/{0}/{1}/'.format(self.search_word, page_title)
        if os.path.exists(save_dir) is False:
            os.makedirs(save_dir)
        save_file = save_dir + url.split("/")[-1] + '.png'
        if os.path.exists(save_file):
            return
        try:
            with request.urlopen(url, timeout=30) as response, open(save_file, 'wb') as f_save:
                f_save.write(response.read())
            print('Image is saved! search_word={0}, page_title={1}, save_file={2}'.format(self.search_word, page_title, save_file))
        except Exception as e:
            print('save picture exception.' + str(e))

    def go(self):
        offset = 0
        while True:
            page_list = self._parse_data(self._crawl_data(offset))
            if page_list is None or len(page_list) <= 0:
                break
            try:
                for page in page_list:
                    article_title = page['article_title']
                    for img in page['article_image_detail']:
                        self._save_picture(article_title, img)
            except Exception as e:
                print('go exception.' + str(e))
            finally:
                offset += 20

if __name__ == '__main__':
    # Crawl the large in-article images for a few Toutiao search keywords
    CrawlOptAnalysis("美女").go()
    CrawlOptAnalysis("旅游").go()
    CrawlOptAnalysis("风景").go()

CSDN blog rankings

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import re
import codecs

def crawl(driver, url):
    driver.get(url)
    print("CSDN ranking: \tweekly article ranking \t total views")
    infofile = codecs.open("Result_csdn.txt", 'a', 'utf-8')
    print('Crawled information:\n')
    content = driver.find_elements_by_xpath('/html/body/div[5]/div[1]/ul/li')
    # print(content)
    for item in content:
        result = item.find_element_by_tag_name('em').text.split('.')[0] \
                 + ':\t' \
                 + item.find_element_by_tag_name('a').text \
                 + '\t\t\t' \
                 + item.find_element_by_tag_name('b').text + '\n'
        print(result)
        infofile.write(result)
    infofile.close()

if __name__ == '__main__':
    print('this is main function:')
    URL = 'http://blog.csdn.net/ranking.html'
    Driver = webdriver.PhantomJS()
    # Driver = webdriver.Chrome()
    crawl(Driver, URL)
    Driver.close()

Douban movie rankings

# coding=utf-8
from selenium import webdriver
import time
import codecs

driver = webdriver.PhantomJS(executable_path="D:/phantomjs.exe")
driver.get("https://movie.douban.com/typerank?type_name=剧情&type=11&interval_id=100:90&action=")
infofile = codecs.open("dmb1.txt", 'a', 'utf-8')
# scroll down 10000 pixels so the lazily loaded entries are rendered
js = "document.body.scrollTop=10000"
#js = "var q=document.documentElement.scrollTop=10000"
# run the JS snippet, then give the page time to load
driver.execute_script(js)
time.sleep(10)
content = driver.find_elements_by_xpath('//*[@id="content"]/div/div[1]/div[6]/div')
time.sleep(3)
dbm = []
for item in content:
    result = item.find_element_by_class_name('movie-name-text').text \
             + ',' \
             + item.find_element_by_class_name('movie-misc').text \
             + ',' \
             + item.find_element_by_class_name('rating_num').text \
             + ',' \
             + item.find_element_by_class_name('comment-num').text
    dbm.append(result)
print(dbm)
infofile.write(str(dbm))
infofile.close()
# take a screenshot of the page
driver.save_screenshot("newdouban.png")
driver.quit()
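
The fixed time.sleep calls above work, but an explicit wait is more robust when the list is loaded lazily. A minimal sketch using Selenium's WebDriverWait, meant to replace the sleeps before the find_elements_by_xpath call (it assumes the movie-name-text class used above is present once the list has rendered):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 15 seconds for at least one movie entry to appear,
# instead of sleeping for a fixed amount of time.
WebDriverWait(driver, 15).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, 'movie-name-text'))
)
content = driver.find_elements_by_xpath('//*[@id="content"]/div/div[1]/div[6]/div')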