Python web crawling (2)

Web crawling examples in Python using urllib, BeautifulSoup, re, and requests.
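
All of the examples below follow the same basic pattern: send an HTTP request with a browser-like User-Agent header, then pull the data out of the response with BeautifulSoup or a regular expression. A minimal sketch of that pattern (the URL and the tags searched for are placeholders, not taken from any of the examples below):

import re
import requests
from bs4 import BeautifulSoup

url = 'http://example.com/'                  # hypothetical page to crawl
headers = {'User-Agent': 'Mozilla/5.0'}      # many sites reject requests without a browser UA
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'

soup = BeautifulSoup(resp.text, 'lxml')      # structured parsing with BeautifulSoup
links = [a.get('href') for a in soup.find_all('a')]

titles = re.findall(r'<title>(.*?)</title>', resp.text)  # the same text can also be mined with re
print(links, titles)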

Crawling images from shuaia.net with Python 3.6

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
import requests
import os
import time

if __name__ == '__main__':
    list_url = []
    # collect the detail-page links from the first two index pages
    for num in range(1, 3):
        if num == 1:
            url = 'http://www.shuaia.net/index.html'
        else:
            url = 'http://www.shuaia.net/index_%d.html' % num
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        bf = BeautifulSoup(html, 'lxml')
        targets_url = bf.find_all(class_='item-img')
        for each in targets_url:
            list_url.append(each.img.get('alt') + '=' + each.get('href'))
    print('Link collection finished')

    # visit each detail page and download its image
    for each_img in list_url:
        img_info = each_img.split('=')
        target_url = img_info[1]
        filename = img_info[0] + '.jpg'
        print('Downloading: ' + filename)
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
        }
        img_req = requests.get(url=target_url, headers=headers)
        img_req.encoding = 'utf-8'
        img_html = img_req.text
        img_bf_1 = BeautifulSoup(img_html, 'lxml')
        img_url = img_bf_1.find_all('div', class_='wr-single-content-list')
        img_bf_2 = BeautifulSoup(str(img_url), 'lxml')
        img_url = 'http://www.shuaia.net' + img_bf_2.div.img.get('src')
        if 'images' not in os.listdir():
            os.makedirs('images')
        urlretrieve(url=img_url, filename='images/' + filename)
        time.sleep(1)
    print('Download finished!')
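
Note that the os.listdir() check above only looks at the current working directory. A slightly more robust variant (my suggestion, not part of the reposted script) lets os.makedirs handle the existence check itself and joins the path portably:

os.makedirs('images', exist_ok=True)                                 # create the folder only if it is missing
urlretrieve(url=img_url, filename=os.path.join('images', filename))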

repost

Crawling douban.com/top250 with Python 3.6

from bs4 import BeautifulSoup
import requests

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}  # browser request header (most sites reject the request without it, so be sure to include it)
all_url = 'https://movie.douban.com/top250?start='  # base URL; each page starts 25 entries further on
num = 0
while num < 250:
    page_url = all_url + str(num)  # build the page URL from the base instead of appending to it repeatedly
    num = num + 25
    start_html = requests.get(page_url, headers=headers)
    soup = BeautifulSoup(start_html.text, 'lxml')
    dmn = soup.find_all('div', class_='hd')  # blocks containing the movie titles
    dmr = soup.find_all('div', class_='bd')  # blocks containing the ratings
    dbm = []
    for item in dmn:
        dbm.append(item.find_all('span')[0].string)
    for item in dmr:
        dbm.append(item.find_all('span')[1].string)
    print(dbm)
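
The loop above collects all the titles of a page and then all the ratings into one flat list. A small variant (my own sketch, assuming the hd and bd blocks line up one-to-one for the 25 entries on a page, which the original also relies on) pairs each title with its rating:

for hd, bd in zip(dmn, dmr):
    title = hd.find_all('span')[0].string    # first span of the hd block holds the title
    rating = bd.find_all('span')[1].string   # second span of the bd block holds the rating
    print(title, rating)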

Original article; please credit the source when reposting.

Crawling image.baidu.com with Python 3.6

import requests
import os

def getManyPages(keyword, pages):
    # build one request-parameter dict per result page of Baidu's image-search AJAX interface
    params = []
    for i in range(30, 30 * pages + 30, 30):
        params.append({
            'tn': 'resultjson_com',
            'ipn': 'rj',
            'ct': 201326592,
            'is': '',
            'fp': 'result',
            'queryWord': keyword,
            'cl': 2,
            'lm': -1,
            'ie': 'utf-8',
            'oe': 'utf-8',
            'adpicid': '',
            'st': -1,
            'z': '',
            'ic': 0,
            'word': keyword,
            's': '',
            'se': '',
            'tab': '',
            'width': '',
            'height': '',
            'face': 0,
            'istype': 2,
            'qc': '',
            'nc': 1,
            'fr': '',
            'pn': i,      # result offset
            'rn': 30,     # results per page
            'gsm': '1e',
            '1488942260214': ''
        })
    url = 'https://image.baidu.com/search/acjson'
    urls = []
    for i in params:
        urls.append(requests.get(url, params=i).json().get('data'))
    return urls

def getImg(dataList, localPath):
    if not os.path.exists(localPath):  # create the target folder if it does not exist yet
        os.mkdir(localPath)
    x = 0
    for page in dataList:
        for i in page:
            if i.get('thumbURL') is not None:
                print('Downloading: %s' % i.get('thumbURL'))
                ir = requests.get(i.get('thumbURL'))
                open(localPath + '%d.jpg' % x, 'wb').write(ir.content)
                x += 1
            else:
                print('Image link does not exist')

if __name__ == '__main__':
    dataList = getManyPages('王尼玛', 10)  # arg 1: search keyword, arg 2: number of pages to fetch
    getImg(dataList, 'd:/NBA录像/')        # arg 2: directory to save the images into
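
In the parameter dict, pn is the result offset and rn the page size, so getManyPages walks the search results 30 at a time and collects the 'data' list of each JSON response. A more defensive download loop (a sketch, not part of the reposted script) guards against a missing 'data' field and failed requests, and closes the file handle properly:

def save_thumbs(dataList, localPath):
    os.makedirs(localPath, exist_ok=True)
    x = 0
    for page in dataList:
        for item in page or []:                 # 'data' can be missing, so guard against None
            thumb = item.get('thumbURL')
            if not thumb:
                continue
            try:
                ir = requests.get(thumb, timeout=10)
                ir.raise_for_status()
            except requests.RequestException:
                continue                        # skip thumbnails that fail to download
            with open(os.path.join(localPath, '%d.jpg' % x), 'wb') as f:
                f.write(ir.content)
            x += 1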

repost

Crawling douban.com/tags/ with Python 3.6

import requests
import time
import json
import csv

headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
url = 'https://movie.douban.com/j/new_search_subjects?'
params = {'sort': 'T', 'range': '0,10', 'tags': '', 'start': '0'}
start_html = requests.post(url, data=params, headers=headers)
htmlcontent = start_html.content.decode('utf-8')
data = json.loads(htmlcontent.strip())
title_n = data['data']

# keep the title and rating of the first two entries
num = 0
title_nmb = []
while num < 2:
    title_nm = {}
    title_nm['title'] = title_n[num]['title']
    title_nm['rating'] = title_n[num]['rate']
    num = num + 1
    title_nmb.append(title_nm)
    time.sleep(5)
print(title_nmb)

# write the collected rows to a csv file
csvfile = open('11.csv', 'w', newline='')
keys = title_nmb[0].keys()
writer = csv.writer(csvfile)
writer.writerow(keys)  # header row
for row in title_nmb:
    writer.writerow(row.values())
csvfile.close()
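
Since each collected row is already a dict, csv.DictWriter (a standard-library alternative sketched here, not taken from the original post) expresses the same output more directly and handles the header row itself:

with open('11.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=list(title_nmb[0].keys()))
    writer.writeheader()
    writer.writerows(title_nmb)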

Crawling JSON data with Python 3.6 and exporting it to Excel

import urllib.request
import json
import xlwt

def getDate():
    # fetch the raw JSON list of upcoming contests
    page = urllib.request.Request("http://contests.acmicpc.info/contests.json")
    response = urllib.request.urlopen(page)
    return response.read().decode('utf-8')

def getJson(s):
    return json.loads(s)

def writeExcel(header, v):
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')
    for c in range(len(header)):
        ws.write(0, c, header[c])                # header cell for this column
        for r in range(len(v)):
            ws.write(r + 1, c, v[r][header[c]])  # fill the column with the matching field of every record
    wb.save('Recent contests.xls')

header = ['id', 'oj', 'name', 'link', 'start_time', 'week', 'access']
writeExcel(header, getJson(getDate()))
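
writeExcel fills the sheet column by column: for each header it writes the column title into row 0 and then the matching field of every contest record below it. An equivalent row-by-row version (a sketch under the same assumption that every record carries all of the header fields):

def writeExcelByRow(header, rows):
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')
    for c, name in enumerate(header):
        ws.write(0, c, name)                  # header row
    for r, record in enumerate(rows, start=1):
        for c, name in enumerate(header):
            ws.write(r, c, record[name])      # one contest per row
    wb.save('Recent contests.xls')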