Python web crawling (5)

Web crawling examples with Python, using urllib, BeautifulSoup, re, and requests.

Tmall comments

Reposted from http://blog.csdn.net/flysky1991/article/details/74586286

import requests
import json

# JSON data for the product comments
url = 'https://rate.tmall.com/list_detail_rate.htm?itemId=541396117031&spuId=128573071&sellerId=2616970884&order=3&currentPage=1&append=0&content=1'
req = requests.get(url)
# Strip the JSONP prefix so only the JSON body remains
jsondata = req.text[15:]
data = json.loads(jsondata)
# Print the page information
print('page:', data['paginator']['page'])
# Iterate over the comment list
for i in data["rateList"]:
    # Print the product SKU
    print(i['auctionSku'])
    # Print the comment time and content
    print(i['rateDate'], i['rateContent'])
    info = i['appendComment']
    # Check whether there is a follow-up comment
    if info:
        print(info['commentTime'])
        print(info['content'])
    print('======')
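
The hard-coded req.text[15:] slice depends on the exact length of the JSONP prefix Tmall happens to return. A more tolerant way to pull the JSON body out of the response is to slice between the first '{' and the last '}'; the helper below (extract_json is a hypothetical name, not part of the original script) is a minimal sketch of that idea, assuming the payload is a single top-level object.

import json
import requests

def extract_json(text):
    """Return the JSON object embedded in a JSONP-style response.

    Assumes the payload is one top-level {...} object; raises ValueError
    if no braces are found.
    """
    start = text.find('{')
    end = text.rfind('}')
    if start == -1 or end == -1:
        raise ValueError('no JSON object found in response')
    return json.loads(text[start:end + 1])

# Usage (hypothetical): data = extract_json(requests.get(url).text)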

JD comments

Reposted from http://blog.csdn.net/flysky1991/article/details/75040253

# -*- coding: utf-8 -*-
import urllib.request
import json
import time
import random
import pymysql.cursors

def crawlProductComment(url, page):
    # Read the raw response (note: JD returns gbk-encoded data)
    html = urllib.request.urlopen(url).read().decode('gbk')
    # Extract the JSON payload from the raw data ('{' and '}' mark its start and end)
    jsondata = html[27:-2]
    # print(jsondata)
    data = json.loads(jsondata)
    # print(data['comments'])
    # print(data['comments'][0]['content'])
    # Iterate over the product comment list
    for i in data['comments']:
        productName = i['referenceName']
        commentTime = i['creationTime']
        content = i['content']
        # Print the key fields of the comment
        print("Product name: {}".format(productName))
        print("Comment time: {}".format(commentTime))
        print("Comment content: {}".format(content))
        print("-----------------------------")
        '''
        Database operations
        '''
        # Get a database connection
        connection = pymysql.connect(host='localhost',
                                     user='root',
                                     password='123456',
                                     db='jd',
                                     charset='utf8mb4')
        try:
            # Get a cursor
            with connection.cursor() as cursor:
                # Build the SQL statement (values are passed as parameters)
                sql = """insert into `jd-mi6` (`productName`,`commentTime`,`content`) values (%s,%s,%s)"""
                # Execute the SQL statement
                cursor.execute(sql, (productName, commentTime, content))
            # Commit to the database
            connection.commit()
        finally:
            connection.close()

for i in range(0, 10):
    print("Fetching page {} of comment data!".format(i + 1))
    # Xiaomi 6 comment URL; loop over pages by changing the page parameter
    url = 'https://club.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv56668&productId=4099139&score=0&sortType=5&page=' + str(i) + '&pageSize=10&isShadowSku=0&fold=1'
    crawlProductComment(url, i)
    # Sleep between requests
    time.sleep(random.randint(31, 33))
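
The script assumes a MySQL database named jd with a jd-mi6 table already in place; the original article does not show the schema. A minimal one-off setup sketch that matches the three columns inserted above could look like this (the column types and lengths are assumptions, adjust them to your data).

import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS `jd-mi6` (
    `id`          INT AUTO_INCREMENT PRIMARY KEY,
    `productName` VARCHAR(255),
    `commentTime` VARCHAR(64),
    `content`     TEXT
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       db='jd', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute(ddl)   # create the table the crawler writes into
    conn.commit()
finally:
    conn.close()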

QQ Music URLs

Reposted from http://www.cnblogs.com/dearvee/p/6602677.html

import requests
import urllib.request
import json

word = '雨蝶'
res1 = requests.get('https://c.y.qq.com/soso/fcgi-bin/client_search_cp?&t=0&aggr=1&cr=1&catZhida=1&lossless=0&flag_qc=0&p=1&n=20&w=' + word)
jm1 = json.loads(res1.text.strip('callback()[]'))
jm1 = jm1['data']['song']['list']
mids = []
songmids = []
srcs = []
songnames = []
singers = []
for j in jm1:
    try:
        mids.append(j['media_mid'])
        songmids.append(j['songmid'])
        songnames.append(j['songname'])
        singers.append(j['singer'][0]['name'])
    except Exception:
        print('wrong')
for n in range(0, len(mids)):
    res2 = requests.get('https://c.y.qq.com/base/fcgi-bin/fcg_music_express_mobile3.fcg?&jsonpCallback=MusicJsonCallback&cid=205361747&songmid=' + songmids[n] + '&filename=C400' + mids[n] + '.m4a&guid=6612300644')
    jm2 = json.loads(res2.text)
    vkey = jm2['data']['items'][0]['vkey']
    srcs.append('http://dl.stream.qqmusic.qq.com/C400' + mids[n] + '.m4a?vkey=' + vkey + '&guid=6612300644&uin=0&fromtag=66')
print('For ' + word + ' Start download...')
x = len(srcs)
for m in range(0, x):
    print(str(m) + '***** ' + songnames[m] + ' - ' + singers[m] + '.m4a *****' + ' Downloading...')
    try:
        urllib.request.urlretrieve(srcs[m], 'd:/music/' + songnames[m] + ' - ' + singers[m] + '.m4a')
    except Exception:
        x = x - 1
        print('Download wrong~')
print('For [' + word + '] Download complete ' + str(x) + ' files !')
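
The download loop assumes that d:/music/ already exists and that song and artist names contain only characters Windows accepts in file names. A small defensive helper along these lines keeps urlretrieve from failing on either point; download_track and its character blacklist are assumptions for illustration, not part of the original script.

import os
import re
import urllib.request

def download_track(src, directory, title):
    """Download one track, creating the directory and sanitizing the file name."""
    os.makedirs(directory, exist_ok=True)
    # replace characters that are invalid in Windows file names (assumed rule)
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    urllib.request.urlretrieve(src, os.path.join(directory, safe_title + '.m4a'))

# Usage (hypothetical):
# download_track(srcs[m], 'd:/music', songnames[m] + ' - ' + singers[m])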

Sogou pictures

Reposted from http://www.cnblogs.com/dearvee/p/6558571.html

import requests
import json
import urllib.request

def getSogouImag(category, length, path):
    n = length
    cate = category
    imgs = requests.get('http://pic.sogou.com/pics/channel/getAllRecomPicByTag.jsp?category=' + cate + '&tag=%E5%85%A8%E9%83%A8&start=0&len=' + str(n))
    jd = json.loads(imgs.text)
    jd = jd['all_items']
    imgs_url = []
    for j in jd:
        imgs_url.append(j['bthumbUrl'])
    m = 0
    for img_url in imgs_url:
        print('***** ' + str(m) + '.jpg *****' + ' Downloading...')
        urllib.request.urlretrieve(img_url, path + str(m) + '.jpg')
        m = m + 1
    print('Download complete!')

getSogouImag('壁纸', 2000, 'd:/download/壁纸/')
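
Two practical caveats with the loop above: the target path must exist before urlretrieve runs, and not every entry in all_items is guaranteed to carry a bthumbUrl field. A slightly defensive variant of the inner loop is sketched below; download_items is a hypothetical helper and the "skip on missing key" behaviour is an assumption about the API, not something the original post states.

import os
import urllib.request

def download_items(items, path):
    """Download thumbnails from a list of Sogou result dicts,
    skipping entries that have no 'bthumbUrl' field."""
    os.makedirs(path, exist_ok=True)
    for m, item in enumerate(items):
        img_url = item.get('bthumbUrl')
        if not img_url:
            continue  # no thumbnail URL for this record
        print('***** ' + str(m) + '.jpg ***** Downloading...')
        urllib.request.urlretrieve(img_url, os.path.join(path, str(m) + '.jpg'))

# Usage (hypothetical): download_items(jd, 'd:/download/壁纸/')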

Taobao products

Reposted from http://blog.csdn.net/d1240673769/article/details/74620085

# Crawl Taobao products
import urllib.request
import pymysql
import re

# Open a page and return its content
def url_open(url):
    headers = ("user-agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    return data

# Store data into MySQL
def data_Import(sql):
    conn = pymysql.connect(host='127.0.0.1', user='dengjp', password='123456', db='python', charset='utf8')
    conn.query(sql)
    conn.commit()
    conn.close()

if __name__ == '__main__':
    try:
        # Keyword to search for
        keywd = "短裙"
        keywords = urllib.request.quote(keywd)
        # Number of pages to crawl
        num = 100
        for i in range(num):
            url = "https://s.taobao.com/search?q=" + keywords + "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201856-taobao-item.1&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2C48&s=" + str(i * 44)
            data = url_open(url)
            # Regular expressions for each field
            img_pat = '"pic_url":"(//.*?)"'
            name_pat = '"raw_title":"(.*?)"'
            nick_pat = '"nick":"(.*?)"'
            price_pat = '"view_price":"(.*?)"'
            fee_pat = '"view_fee":"(.*?)"'
            sales_pat = '"view_sales":"(.*?)"'
            comment_pat = '"comment_count":"(.*?)"'
            city_pat = '"item_loc":"(.*?)"'
            # Find all matches and store them in lists
            imgL = re.compile(img_pat).findall(data)
            nameL = re.compile(name_pat).findall(data)
            nickL = re.compile(nick_pat).findall(data)
            priceL = re.compile(price_pat).findall(data)
            feeL = re.compile(fee_pat).findall(data)
            salesL = re.compile(sales_pat).findall(data)
            commentL = re.compile(comment_pat).findall(data)
            cityL = re.compile(city_pat).findall(data)
            for j in range(len(imgL)):
                img = "http:" + imgL[j]   # product image URL
                name = nameL[j]           # product name
                nick = nickL[j]           # shop name
                price = priceL[j]         # product price
                fee = feeL[j]             # shipping fee
                sales = salesL[j]         # number of buyers
                comment = commentL[j]     # number of comments (may be empty)
                if comment == "":
                    comment = 0
                city = cityL[j]           # shop city
                print('Crawling page ' + str(i) + ', item ' + str(j) + '...')
                sql = "insert into taobao(name,price,fee,sales,comment,city,nick,img) values('%s','%s','%s','%s','%s','%s','%s','%s')" % (name, price, fee, sales, comment, city, nick, img)
                data_Import(sql)
        print("Crawling finished; data has been stored in the database")
    except Exception as e:
        print(str(e))
    print("Task finished")

Weibo pictures

Reposted from https://github.com/darrenfantasy/image_crawler/blob/master/SinaWeibo/weibo_crawler.py

# encoding:utf-8
from selenium import webdriver
import time
import requests
import json
from bs4 import BeautifulSoup
import os
import sys

request_params = {"ajwvr": "6", "domain": "100505", "domain_op": "100505", "feed_type": "0", "is_all": "1", "is_tag": "0", "is_search": "0"}
profile_request_params = {"profile_ftype": "1", "is_all": "1"}
weibo_url = "http://weibo.com/"
requset_url = "http://weibo.com/p/aj/v6/mblog/mbloglist?"
cookie_save_file = "cookie.txt"  # file that stores the cookie
cookie_update_time_file = "cookie_timestamp.txt"  # file that stores the cookie timestamp
image_result_file = "image_result.md"  # file that stores the image results
# username = 'your weibo account'  # your Weibo account
# password = 'your weibo password'  # your Weibo password
person_site_name = "mrj168"  # custom domain of the account to crawl; if it has none, use u/ + weibo id, e.g. u/12345678
weibo_id = "1837498771"  # Weibo id; open the profile in a browser, view the page source and look for $CONFIG['oid']='1837498771';
page_size = 5  # number of pages to crawl
headers = {  # the User-Agent should be adjusted for your own machine
    'Accept': '*/*',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Host': 'weibo.com',
    'Pragma': 'no-cache',
    'Referer': 'http://weibo.com/u/3278620272?profile_ftype=1&is_all=1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest'
}

def get_timestamp():  # get the current system timestamp
    try:
        tamp = time.time()
        timestamp = str(int(tamp)) + "000"
        print timestamp
        return timestamp
    except Exception, e:
        print e
    finally:
        pass

def login_weibo_get_cookies():  # log in and fetch the cookies
    time.sleep(2)
    driver.find_element_by_name("username").send_keys(username)  # type the username
    driver.find_element_by_name("password").send_keys(password)  # type the password
    driver.find_element_by_xpath("//a[@node-type='submitBtn']").click()  # click the login button
    cookies = driver.get_cookies()  # fetch the cookies
    print cookies
    cookie = ""
    # convert the returned cookie list into the cookie string Weibo expects
    for x in xrange(len(cookies)):
        value = cookies[x]['name'] + "=" + cookies[x]['value'] + ";"
        cookie = cookie + value
    print cookie
    return cookie

def save_cookie(cookie):  # save the cookie locally
    try:
        f = open(cookie_save_file, 'w')
        f.write(cookie)
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def get_cookie_from_txt():  # read the cookie from the local file
    f = open(cookie_save_file)
    cookie = f.read()
    print cookie
    return cookie

def save_cookie_update_timestamp(timestamp):  # save the cookie update timestamp locally
    try:
        f = open(cookie_update_time_file, 'w')
        f.write(timestamp)
        f.write('\n')
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def get_cookie_update_time_from_txt():  # read the time of the last cookie update
    try:
        f = open(cookie_update_time_file)
        lines = f.readlines()
        cookie_update_time = lines[0]
        print cookie_update_time
        return cookie_update_time
    except Exception, e:
        print e
    finally:
        pass

def write_image_urls(image_list):
    try:
        f = open(image_result_file, 'a+')
        for x in xrange(len(image_list)):
            image = image_list[x]
            show_image = "![](" + image + ")"
            f.write(show_image.encode("utf-8"))
            f.write('\n')
        f.close()
    except Exception, e:
        print e
    finally:
        pass

def is_valid_cookie():  # check whether the cookie is still valid
    if os.path.isfile(cookie_update_time_file) == False:
        return False
    else:
        f = open(cookie_update_time_file)
        lines = f.readlines()
        if len(lines) == 0:
            return False
        else:
            last_time_stamp = get_cookie_update_time_from_txt()
            if long(get_timestamp()) - long(last_time_stamp) > 6 * 60 * 60 * 1000:
                return False
            else:
                return True

def get_object_weibo_by_weibo_id_and_cookie(weibo_id, person_site_name, cookie, pagebar, page):  # call the feed API with the Weibo id and cookie
    try:
        headers["Cookie"] = cookie
        headers['Referer'] = weibo_url + person_site_name + "?profile_ftype=1&is_all=1"
        request_params["__rnd"] = get_timestamp()
        request_params["page"] = page
        request_params["pre_page"] = page
        request_params["pagebar"] = pagebar
        request_params["id"] = "100505" + weibo_id
        request_params["script_uri"] = "/" + person_site_name
        request_params["pl_name"] = "Pl_Official_MyProfileFeed__22"
        request_params["profile_ftype"] = 1
        response = requests.get(requset_url, headers=headers, params=request_params)
        print response.url
        html = response.json()["data"]
        return html
    except Exception, e:
        print e
    finally:
        pass

def get_object_top_weibo_by_person_site_name_and_cookie(person_site_name, cookie, page):  # top posts of each page
    try:
        profile_url = weibo_url + person_site_name + "?"
        headers["Cookie"] = cookie
        profile_request_params["page"] = page
        response = requests.get(profile_url, headers=headers, params=profile_request_params)
        print response.url
        html = response.text
        soup = BeautifulSoup(html, "html.parser")
        script_list = soup.find_all("script")
        script_size = len(script_list)
        print "script_size:" + str(script_size)
        tag = 0
        for x in xrange(script_size):
            if "WB_feed WB_feed_v3 WB_feed_v4" in str(script_list[x]):
                tag = x
        print "tag:" + str(tag)
        # print script_list[script_size-1]
        html_start = str(script_list[tag]).find("<div")
        html_end = str(script_list[tag]).rfind("div>")
        # print str(script_list[tag])[html_start:html_end+4]
        return str(str(script_list[tag])[html_start:html_end + 4])
    except Exception, e:
        print e
    finally:
        pass

def get_img_urls_form_html(html):  # extract image URLs from the returned HTML string
    try:
        image_url_list = []
        result_html = html.replace("\\", "")
        soup = BeautifulSoup(result_html, "html.parser")
        div_list = soup.find_all("div", 'media_box')
        print "div_list:" + str(len(div_list))
        for x in xrange(len(div_list)):
            image_list = div_list[x].find_all("img")
            for y in xrange(len(image_list)):
                image_url = image_list[y].get("src").replace("\\", "")
                print image_url
                image_url_list.append(image_url.replace("\"", ""))
        return image_url_list
    except Exception, e:
        print e
    finally:
        pass

if len(sys.argv) == 6:
    username = sys.argv[1]
    password = sys.argv[2]
    person_site_name = sys.argv[3]
    weibo_id = sys.argv[4]
    page_size = int(sys.argv[5])
    print "Weibo account: " + username
    print "Weibo password: " + password
    print "Custom domain of the account to crawl (use u/ + weibo id if it has none): " + person_site_name
    print "Id of the account to crawl: " + weibo_id
    print "Pages to crawl: " + str(page_size)
else:
    print "Wrong arguments; please pass 5 parameters in this order: 1. Weibo account 2. Weibo password 3. custom domain of the account to crawl (u/ + weibo id if it has none) 4. id of the account to crawl 5. number of pages to crawl"
    sys.exit(0)
result = is_valid_cookie()
print result
if result == False:
    driver = webdriver.Chrome("/Users/darrenfantasy/Documents/study/python/image_crawler/SinaWeibo/chromedriver")  # launch Chrome
    driver.maximize_window()  # maximize the browser window
    driver.get(weibo_url)  # open the Weibo login page
    time.sleep(10)  # wait 10s so the login page has time to finish loading
    cookie = login_weibo_get_cookies()
    save_cookie(cookie)
    save_cookie_update_timestamp(get_timestamp())
else:
    cookie = get_cookie_from_txt()
for x in xrange(1, page_size + 1):
    profile_html = get_object_top_weibo_by_person_site_name_and_cookie(person_site_name, cookie, x)
    image_url_list = get_img_urls_form_html(profile_html)
    write_image_urls(image_url_list)
    for y in xrange(0, 2):  # the page loads two extra "scroll down for more" chunks per page
        print "pagebar:" + str(y)
        html = get_object_weibo_by_weibo_id_and_cookie(weibo_id, person_site_name, cookie, y, x)
        image_url_list = get_img_urls_form_html(html)
        write_image_urls(image_url_list)
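
Unlike the other examples above, this Weibo script is written for Python 2 (print statements, xrange, except Exception, e, long). Porting it to Python 3 is mostly mechanical; as one example, here is a sketch of get_timestamp() rewritten in Python 3 syntax, with the remaining substitutions noted in comments. This is an idiom-level sketch only, not a tested port of the whole crawler.

import time

def get_timestamp():
    """Return the current time as a millisecond timestamp string."""
    try:
        tamp = time.time()
        timestamp = str(int(tamp)) + "000"
        print(timestamp)        # print is a function in Python 3
        return timestamp
    except Exception as e:      # 'except Exception, e' is Python 2 only
        print(e)

# Elsewhere the changes are: xrange -> range, long -> int, and writing text
# (not UTF-8 encoded bytes) to files opened with encoding="utf-8".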