Python web crawling (2)

Web crawling examples in Python using urllib, BeautifulSoup, re, and requests.
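
All of the examples below follow the same basic pattern: send an HTTP request with a browser-like User-Agent header, then pull the data out of the response with BeautifulSoup or a regular expression. A minimal sketch of that pattern (the URL and the tags searched for are placeholders, not taken from any of the examples below):

import re
import requests
from bs4 import BeautifulSoup

url = 'http://example.com/'                  # hypothetical page to crawl
headers = {'User-Agent': 'Mozilla/5.0'}      # many sites reject requests without a browser UA
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'

soup = BeautifulSoup(resp.text, 'lxml')      # structured parsing with BeautifulSoup
links = [a.get('href') for a in soup.find_all('a')]

titles = re.findall(r'<title>(.*?)</title>', resp.text)  # the same text can also be mined with re
print(links, titles)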

Crawling images from shuaia.net with Python 3.6

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
    list_url = []
    # collect the detail-page links from the first two index pages
    for num in range(1, 3):
        if num == 1:
            url = 'http://www.shuaia.net/index.html'
        else:
            url = 'http://www.shuaia.net/index_%d.html' % num
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='item-img')
        for each in targets_url:
            list_url.append(each.img.get('alt') + '=' + each.get('href'))
    print('Link collection finished')

    # visit each detail page and download its image
    for each_img in list_url:
        img_info = each_img.split('=')
        target_url = img_info[1]
        filename = img_info[0] + '.jpg'
        print('Downloading: ' + filename)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        img_req = requests.get(url=target_url, headers=headers)
        img_req.encoding = 'utf-8'
        img_html = img_req.text
        img_bf_1 = BeautifulSoup(img_html, 'lxml')
        img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
        img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
        img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
        if 'images' not in os.listdir():
            os.makedirs('images')
        urlretrieve(url=img_url, filename='images/' + filename)
        time.sleep(1)
    print('Download finished!')
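
Note that the os.listdir() check above only looks at the current working directory. A slightly more robust variant (my suggestion, not part of the reposted script) lets os.makedirs handle the existence check itself and joins the path portably:

os.makedirs('images', exist_ok=True)                                 # create the folder only if it is missing
urlretrieve(url=img_url, filename=os.path.join('images', filename))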

repost

Crawling douban.com/top250 with Python 3.6

from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request header (most sites reject the request without it, so be sure to include it)
all_url = 'https://movie.douban.com/top250?start='  # base URL; each page starts 25 entries further on
num = 0
while num < 250:
    page_url = all_url + str(num)  # build the page URL from the base instead of appending to it repeatedly
    num = num + 25
    start_html = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(start_html.text, 'lxml')
    dmn = soup.find_all('div', class_='hd')  # blocks containing the movie titles
    dmr = soup.find_all('div', class_='bd')  # blocks containing the ratings
    dbm = []
    for item in dmn:
        dbm.append(item.find_all('span')[0].string)
    for item in dmr:
        dbm.append(item.find_all('span')[1].string)
    print(dbm)
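
The loop above collects all the titles of a page and then all the ratings into one flat list. A small variant (my own sketch, assuming the hd and bd blocks line up one-to-one for the 25 entries on a page, which the original also relies on) pairs each title with its rating:

for hd, bd in zip(dmn, dmr):
    title = hd.find_all('span')[0].string    # first span of the hd block holds the title
    rating = bd.find_all('span')[1].string   # second span of the bd block holds the rating
    print(title, rating)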

Original article; please credit the source when reposting.

Crawling image.baidu.com with Python 3.6

import requests
import os

def getManyPages(keyword, pages):
    # build one request-parameter dict per result page of Baidu's image-search AJAX interface
    params = []
    for i in range(30, 30 * pages + 30, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,      # result offset
            'rn': 30,     # results per page
            'gsm': '1e',
            '1488942260214': ''
        })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for i in params:
        urls.append(requests.get(url, params=i).json().get('data'))
    return urls

def getImg(dataList, localPath):
    if not os.path.exists(localPath):  # create the target folder if it does not exist yet
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        for i in page:
            if i.get('thumbURL') is not None:
                print('Downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'))
                open(localPath + '%d.jpg' % x, 'wb').write(ir.content)
                x += 1
            else:
                print('Image link does not exist')

if __name__ == '__main__':
    dataList = getManyPages('王尼玛', 10)  # arg 1: search keyword, arg 2: number of pages to fetch
    getImg(dataList, 'd:/NBA录像/')        # arg 2: directory to save the images into
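
In the parameter dict, pn is the result offset and rn the page size, so getManyPages walks the search results 30 at a time and collects the 'data' list of each JSON response. A more defensive download loop (a sketch, not part of the reposted script) guards against a missing 'data' field and failed requests, and closes the file handle properly:

def save_thumbs(dataList, localPath):
    os.makedirs(localPath, exist_ok=True)
    x = 0
    for page in dataList:
        for item in page or []:                 # 'data' can be missing, so guard against None
            thumb = item.get('thumbURL')
            if not thumb:
                continue
            try:
                ir = requests.get(thumb, timeout=10)
                ir.raise_for_status()
            except requests.RequestException:
                continue                        # skip thumbnails that fail to download
            with open(os.path.join(localPath, '%d.jpg' % x), 'wb') as f:
                f.write(ir.content)
            x += 1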

repost

Crawling douban.com/tags/ with Python 3.6

import requests
import time
import json
import csv

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url = 'https://movie.douban.com/j/new_search_subjects?'
params = {'sort': 'T', 'range': '0,10', 'tags': '', 'start': '0'}
start_html = requests.post(url, data=params, headers=headers)
htmlcontent = start_html.content.decode('utf-8')
data = json.loads(htmlcontent.strip())
title_n = data['data']

# keep the title and rating of the first two entries
num = 0
title_nmb = []
while num < 2:
    title_nm = {}
    title_nm['title'] = title_n[num]['title']
    title_nm['rating'] = title_n[num]['rate']
    num = num + 1
    title_nmb.append(title_nm)
    time.sleep(5)
print(title_nmb)

# write the collected rows to a csv file
csvfile = open('11.csv', 'w', newline='')
keys = title_nmb[0].keys()
writer = csv.writer(csvfile)
writer.writerow(keys)  # header row
for row in title_nmb:
    writer.writerow(row.values())
csvfile.close()
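
Since each collected row is already a dict, csv.DictWriter (a standard-library alternative sketched here, not taken from the original post) expresses the same output more directly and handles the header row itself:

with open('11.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(title_nmb[0].keys()))
    writer.writeheader()
    writer.writerows(title_nmb)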

Crawling JSON data with Python 3.6 and exporting it to Excel

import urllib.request
import json
import xlwt

def getDate():
    # fetch the raw JSON list of upcoming contests
    page = urllib.request.Request("http://contests.acmicpc.info/contests.json")
    response = urllib.request.urlopen(page)
    return response.read().decode('utf-8')

def getJson(s):
    return json.loads(s)

def writeExcel(header, v):
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')
    for c in range(len(header)):
        ws.write(0, c, header[c])                # header cell for this column
        for r in range(len(v)):
            ws.write(r + 1, c, v[r][header[c]])  # fill the column with the matching field of every record
    wb.save('Recent contests.xls')

header = ['id', 'oj', 'name', 'link', 'start_time', 'week', 'access']
writeExcel(header, getJson(getDate()))
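
writeExcel fills the sheet column by column: for each header it writes the column title into row 0 and then the matching field of every contest record below it. An equivalent row-by-row version (a sketch under the same assumption that every record carries all of the header fields):

def writeExcelByRow(header, rows):
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')
    for c, name in enumerate(header):
        ws.write(0, c, name)                  # header row
    for r, record in enumerate(rows, start=1):
        for c, name in enumerate(header):
            ws.write(r, c, record[name])      # one contest per row
    wb.save('Recent contests.xls')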