Python web crawling using Scrapy (2)

Web crawling examples with Python, using Scrapy.

taobaosearch

Reposted from http://blog.csdn.net/github_35160620/article/details/53880412

pipelines.py

class ThirddemoPipeline(object):
    def process_item(self, item, spider):
        title = item['title'][0]
        link = item['link']
        price = item['price'][0]
        comment = item['comment'][0]
        print('Product name:', title)
        print('Product link:', link)
        print('Regular price:', price)
        print('Number of comments:', comment)
        print('------------------------------\n')
        return item

settings.py

BOT_NAME = 'thirdDemo'
SPIDER_MODULES = ['thirdDemo.spiders']
NEWSPIDER_MODULE = 'thirdDemo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
ITEM_PIPELINES = {
    'thirdDemo.pipelines.ThirddemoPipeline': 300,
}
COOKIES_ENABLED = False
FEED_EXPORT_ENCODING = 'utf-8' #unicode to utf8

items.py

import scrapy


class ThirddemoItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
    price = scrapy.Field()
    comment = scrapy.Field()

taobao.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from thirdDemo.items import ThirddemoItem
import urllib.request


class TaobaoSpider(scrapy.Spider):
    name = "taobao"
    allowed_domains = ["taobao.com"]
    start_urls = ['http://taobao.com/']

    def parse(self, response):
        key = '小吃'  # search keyword ("snacks")
        for i in range(0, 2):
            url = 'https://s.taobao.com/search?q=' + str(key) + '&s=' + str(44 * i)
            print(url)
            yield Request(url=url, callback=self.page)

    def page(self, response):
        body = response.body.decode('utf-8', 'ignore')
        pattam_id = '"nid":"(.*?)"'
        all_id = re.compile(pattam_id).findall(body)
        # print(all_id)
        # print(len(all_id))
        for i in range(0, len(all_id)):
            this_id = all_id[i]
            url = 'https://item.taobao.com/item.htm?id=' + str(this_id)
            yield Request(url=url, callback=self.next)

    def next(self, response):
        item = ThirddemoItem()
        # print(response.url)
        url = response.url
        # Determine whether the product page belongs to Tmall / Tmall supermarket or to Taobao.
        pattam_url = 'https://(.*?).com'
        subdomain = re.compile(pattam_url).findall(url)
        # print(subdomain)
        # Extract the product title.
        if subdomain[0] != 'item.taobao':  # not a Taobao subdomain, so this is a Tmall page
            title = response.xpath("//div[@class='tb-detail-hd']/h1/text()").extract()
        else:
            title = response.xpath("//h3[@class='tb-main-title']/@data-title").extract()
        # print(title)
        item['title'] = title
        # print(item['title'])
        # Product page URL.
        item['link'] = url
        # Extract the regular price.
        if subdomain[0] != 'item.taobao':  # Tmall
            pattam_price = '"defaultItemPrice":"(.*?)"'
            price = re.compile(pattam_price).findall(response.body.decode('utf-8', 'ignore'))
        else:  # Taobao
            price = response.xpath("//em[@class = 'tb-rmb-num']/text()").extract()
        # print(price)
        item['price'] = price
        # Extract the product id (used to build the URL of the comment-count endpoint).
        if subdomain[0] != 'item.taobao':
            pattam_id = 'id=(.*?)&'
        else:
            # The id is the last query parameter here, so anchor the pattern with $.
            pattam_id = 'id=(.*?)$'
        this_id = re.compile(pattam_id).findall(url)[0]
        # print(this_id)
        # Build the URL of the endpoint that carries the comment count.
        comment_url = 'https://dsr-rate.tmall.com/list_dsr_info.htm?itemId=' + str(this_id)
        # This request never fails outright: even for a wrong id the endpoint still returns
        # a page, so the original author did not wrap it in try/except urllib.error.URLError.
        comment_data = urllib.request.urlopen(comment_url).read().decode('utf-8', 'ignore')
        pattam_comment = '"rateTotal":(.*?),"'
        comment = re.compile(pattam_comment).findall(comment_data)
        # print(comment)
        item['comment'] = comment
        yield item
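To launch the spider without typing the scrapy command each time, a tiny runner script can call Scrapy's command-line entry point programmatically. This is only a convenience sketch; the file name run.py is an assumption, not part of the original post.

# run.py -- hypothetical runner placed in the project root (next to scrapy.cfg)
from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl taobao"; scraped items are passed to
# ThirddemoPipeline, which prints them to the console.
execute(['scrapy', 'crawl', 'taobao'])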

Fetching proxy IPs

Reposted from http://blog.csdn.net/c406495762/article/details/72793480

# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
from selenium import webdriver
import subprocess as sp
from lxml import etree
import requests
import random
import re

"""
Fetch a list of proxy IPs.
Parameters:
    page - page number of the high-anonymity proxy list, defaults to the first page
Returns:
    proxys_list - list of proxies
Modify:
    2017-05-27
"""
def get_proxys(page=1):
    # A requests Session keeps cookies automatically, so we do not manage them ourselves.
    S = requests.Session()
    # Xici high-anonymity proxy list
    target_url = 'http://www.xicidaili.com/nn/%d' % page
    # Reasonably complete request headers
    target_headers = {'Upgrade-Insecure-Requests': '1',
                      'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                      'Referer': 'http://www.xicidaili.com/nn/',
                      'Accept-Encoding': 'gzip, deflate, sdch',
                      'Accept-Language': 'zh-CN,zh;q=0.8',
                      }
    # GET request
    target_response = S.get(url=target_url, headers=target_headers)
    # Decode as UTF-8
    target_response.encoding = 'utf-8'
    # Page HTML
    target_html = target_response.text
    # Grab the table whose id is ip_list
    bf1_ip_list = BeautifulSoup(target_html, 'lxml')
    bf2_ip_list = BeautifulSoup(str(bf1_ip_list.find_all(id='ip_list')), 'lxml')
    ip_list_info = bf2_ip_list.table.contents
    # List that stores the proxies
    proxys_list = []
    # Extract each proxy entry
    for index in range(len(ip_list_info)):
        if index % 2 == 1 and index != 1:
            dom = etree.HTML(str(ip_list_info[index]))
            ip = dom.xpath('//td[2]')
            port = dom.xpath('//td[3]')
            protocol = dom.xpath('//td[6]')
            proxys_list.append(protocol[0].text.lower() + '#' + ip[0].text + '#' + port[0].text)
    # Return the proxy list
    return proxys_list

"""
Check whether a proxy IP is reachable.
Parameters:
    ip - proxy IP address
    lose_time - compiled regex matching the packet-loss count
    waste_time - compiled regex matching the average round-trip time
Returns:
    average_time - average round-trip time of the proxy IP
Modify:
    2017-05-27
"""
def check_ip(ip, lose_time, waste_time):
    # ping options (Windows): -n number of echo requests, -w timeout per reply in milliseconds
    cmd = "ping -n 3 -w 3 %s"
    # Run the command
    p = sp.Popen(cmd % ip, stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE, shell=True)
    # Read and decode the output
    out = p.stdout.read().decode("gbk")
    # Packet-loss count
    lose_time = lose_time.findall(out)
    # If the loss count cannot be matched, assume all three requests were lost
    if len(lose_time) == 0:
        lose = 3
    else:
        lose = int(lose_time[0])
    # More than two lost packets counts as a timeout: return 1000 ms
    if lose > 2:
        return 1000
    # Otherwise read the average round-trip time
    else:
        average = waste_time.findall(out)
        # If the average time cannot be matched, treat it as a severe timeout and return 1000 ms
        if len(average) == 0:
            return 1000
        else:
            average_time = int(average[0])
            # Return the average round-trip time
            return average_time

"""
Initialize the regular expressions.
Returns:
    lose_time - regex matching the packet-loss count
    waste_time - regex matching the average round-trip time
Modify:
    2017-05-27
"""
def initpattern():
    # Match the packet-loss count ("丢失 = n" in the Chinese ping output)
    lose_time = re.compile(r"丢失 = (\d+)", re.IGNORECASE)
    # Match the average round-trip time ("平均 = nms")
    waste_time = re.compile(r"平均 = (\d+)ms", re.IGNORECASE)
    return lose_time, waste_time

if __name__ == '__main__':
    # Initialize the regular expressions
    lose_time, waste_time = initpattern()
    # Fetch the proxy list
    proxys_list = get_proxys(1)
    # If the average time is over 200 ms, pick another IP
    while True:
        # Randomly pick one proxy from the list
        proxy = random.choice(proxys_list)
        split_proxy = proxy.split('#')
        # The IP address
        ip = split_proxy[1]
        # Check the IP
        average_time = check_ip(ip, lose_time, waste_time)
        if average_time > 200:
            # Drop the unusable IP
            proxys_list.remove(proxy)
            print("IP timed out, picking another one!")
        if average_time < 200:
            break
    # Drop the IP that has just been chosen
    proxys_list.remove(proxy)
    proxy_dict = {split_proxy[0]: split_proxy[1] + ':' + split_proxy[2]}
    print("Using proxy:", proxy_dict)

zhihuuser

Reposted from http://cuiqingcai.com/4380.html

zhihu.py

# -*- coding: utf-8 -*-
import json

from scrapy import Spider, Request

from zhihuq.items import UserItem


class ZhihuSpider(Spider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
    follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
    followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
    start_user = 'qing-lan-98'
    user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
    follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
    followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'

    def start_requests(self):
        yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
        yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
                      self.parse_follows)
        yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
                      self.parse_followers)

    def parse_user(self, response):
        result = json.loads(response.text)
        item = UserItem()
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)
        yield item
        yield Request(
            self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
            self.parse_follows)
        yield Request(
            self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
            self.parse_followers)

    def parse_follows(self, response):
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_follows)

    def parse_followers(self, response):
        results = json.loads(response.text)
        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
                              self.parse_user)
        if 'paging' in results.keys() and results.get('paging').get('is_end') == False:
            next_page = results.get('paging').get('next')
            yield Request(next_page, self.parse_followers)

items.py

from scrapy import Item, Field


class UserItem(Item):
    # define the fields for your item here like:
    id = Field()
    name = Field()
    avatar_url = Field()
    headline = Field()
    description = Field()
    url = Field()
    url_token = Field()
    gender = Field()
    cover_url = Field()
    type = Field()
    badge = Field()
    answer_count = Field()
    articles_count = Field()
    commercial_question_count = Field()
    favorite_count = Field()
    favorited_count = Field()
    follower_count = Field()
    following_columns_count = Field()
    following_count = Field()
    pins_count = Field()
    question_count = Field()
    thank_from_count = Field()
    thank_to_count = Field()
    thanked_count = Field()
    vote_from_count = Field()
    vote_to_count = Field()
    voteup_count = Field()
    following_favlists_count = Field()
    following_question_count = Field()
    following_topic_count = Field()
    marked_answers_count = Field()
    mutual_followees_count = Field()
    hosted_live_count = Field()
    participated_live_count = Field()
    locations = Field()
    educations = Field()
    employments = Field()

pipelines.py

import pymongo


class MongoPipeline(object):
    collection_name = 'users'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Upsert keyed on url_token so re-crawled users are updated rather than duplicated.
        self.db[self.collection_name].update({'url_token': item['url_token']}, {'$set': dict(item)}, True)
        return item

mysql

import pymysql


class mysqlPipeline(object):
    def process_item(self, item, spider):
        id = item['id']
        name = item['name']
        answer_count = item['answer_count']
        articles_count = item['articles_count']
        favorite_count = item['favorite_count']
        favorited_count = item['favorited_count']
        follower_count = item['follower_count']
        following_count = item['following_count']
        following_columns_count = item['following_columns_count']
        following_question_count = item['following_question_count']
        following_topic_count = item['following_topic_count']
        hosted_live_count = item['hosted_live_count']
        participated_live_count = item['participated_live_count']
        question_count = item['question_count']
        thanked_count = item['thanked_count']
        marked_answers_count = item['marked_answers_count']
        try:
            gender = item['gender']
        except:
            gender = "N"
        try:
            school = item['educations'][0]['school']['name']
        except:
            school = "N"
        try:
            major = item['educations'][0]['major']['name']
        except:
            major = "N"
        try:
            job = item['employments'][0]['job']['name']
        except:
            job = "N"
        try:
            company = item['employments'][0]['company']['name']
        except:
            company = "N"
        try:
            locations = item['locations'][0]['name']
        except:
            locations = "N"
        try:
            headline = item['headline']
        except:
            headline = "N"
        # Connect to the local zhihuuser database
        conn = pymysql.connect(
            host='localhost',   # local database
            user='root',        # your MySQL user name
            passwd='',          # your password
            db='zhihuuser',     # database name
            charset='utf8'      # character set
        )
        try:
            # Get a cursor
            cursor = conn.cursor()
            # SQL INSERT statement (string interpolation, as in the original post)
            sql = """INSERT INTO user(id,name,gender,school,major,job,company,locations,answer_count,articles_count,favorite_count,favorited_count,follower_count,following_count,following_columns_count,following_question_count,following_topic_count,hosted_live_count,participated_live_count,question_count,thanked_count,marked_answers_count,headline)
            VALUES ('%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s', '%s', '%s','%s', '%s', '%s')""" % (id,name,gender,school,major,job,company,locations,answer_count,articles_count,favorite_count,favorited_count,follower_count,following_count,following_columns_count,following_question_count,following_topic_count,hosted_live_count,participated_live_count,question_count,thanked_count,marked_answers_count,headline)
            # Execute the statement
            cursor.execute(sql)
            # Commit the change
            conn.commit()
        except:
            conn.rollback()
        finally:
            # Close the connection
            conn.close()
        return item
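The INSERT above assumes a user table already exists in the zhihuuser database, but the original post does not show its schema. The snippet below is a minimal sketch of a table matching the columns the pipeline writes; every column type here is an assumption chosen for illustration.

# One-off setup script (hypothetical): create the `user` table the pipeline expects.
# All column types are assumptions; adjust lengths/types to your data.
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='', db='zhihuuser', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS user (
                id VARCHAR(64), name VARCHAR(255), gender VARCHAR(8),
                school VARCHAR(255), major VARCHAR(255), job VARCHAR(255),
                company VARCHAR(255), locations VARCHAR(255),
                answer_count INT, articles_count INT, favorite_count INT,
                favorited_count INT, follower_count INT, following_count INT,
                following_columns_count INT, following_question_count INT,
                following_topic_count INT, hosted_live_count INT,
                participated_live_count INT, question_count INT,
                thanked_count INT, marked_answers_count INT,
                headline VARCHAR(255)
            ) DEFAULT CHARSET=utf8
        """)
    conn.commit()
finally:
    conn.close()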

settings.py

FEED_EXPORT_ENCODING = 'utf-8'
DOWNLOAD_DELAY = 2
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
    'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
ITEM_PIPELINES = {
    'zhihuq.pipelines.mysqlPipeline': 300,
}
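These settings route items to mysqlPipeline only. If you prefer the MongoPipeline shown earlier, its from_crawler method reads MONGO_URI and MONGO_DATABASE from the settings, so something along these lines would be needed; the URI and database name below are placeholder assumptions.

# Only needed when switching to MongoPipeline; the values are placeholders.
ITEM_PIPELINES = {
    'zhihuq.pipelines.MongoPipeline': 300,
}
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'zhihuuser'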

start

Run the spider with a persistent job directory so the crawl can be paused (Ctrl-C) and resumed later with the same command:

scrapy crawl zhihu -s JOBDIR=zant/001

delete duplicate rows in mysql (example)

1. create table new_table (select * from user group by name,age,nub having count(*)>1);
2. delete from user where (name,age,nub) in
   (select * from
     (select * from user group by name,age,nub having count(*)>1) as b);
3. insert into user select name,age,nub from new_table;
4. drop table new_table;
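If you prefer to run these steps from Python rather than the MySQL client, a small pymysql sketch like the one below would work; the connection parameters and the generic user(name, age, nub) table are assumptions taken from the example above.

# Hypothetical wrapper around the four deduplication statements above.
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='', db='test', charset='utf8')
try:
    with conn.cursor() as cursor:
        cursor.execute("create table new_table "
                       "(select * from user group by name,age,nub having count(*)>1)")
        cursor.execute("delete from user where (name,age,nub) in "
                       "(select * from (select * from user group by name,age,nub "
                       "having count(*)>1) as b)")
        cursor.execute("insert into user select name,age,nub from new_table")
        cursor.execute("drop table new_table")
    conn.commit()
finally:
    conn.close()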

zhihucontent

Reposted from http://cuiqingcai.com/4607.html

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-6-5
import os
import re
import json
import requests
import html2text
from parse_content import parse

"""
just for study and fun
Talk is cheap
show me your code
"""


class ZhiHu(object):
    def __init__(self):
        self.request_content = None

    def request(self, url, retry_times=10):
        header = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
            'Host': 'www.zhihu.com'
        }
        times = 0
        while retry_times > 0:
            times += 1
            print('request %s, times: %d' % (url, times))
            try:
                self.request_content = requests.get(url, headers=header, timeout=10).content
            except Exception as e:
                print(e)
                retry_times -= 1
            else:
                return self.request_content

    def get_all_answer_content(self, question_id, flag=2):
        first_url_format = 'https://www.zhihu.com/api/v4/questions/{}/answers?sort_by=default&include=data%5B%2A%5D.is_normal%2Cis_collapsed%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B%2A%5D.author.follower_count%2Cbadge%5B%3F%28type%3Dbest_answerer%29%5D.topics&limit=20&offset=3'
        first_url = first_url_format.format(question_id)
        response = self.request(first_url)
        if response:
            contents = json.loads(response)
            print(contents.get('paging').get('is_end'))
            while not contents.get('paging').get('is_end'):
                for content in contents.get('data'):
                    self.parse_content(content, flag)
                next_page_url = contents.get('paging').get('next').replace('http', 'https')
                contents = json.loads(self.request(next_page_url))
        else:
            raise ValueError('request failed, quit......')

    def get_single_answer_content(self, answer_url, flag=1):
        all_content = {}
        question_id, answer_id = re.findall(r'https://www.zhihu.com/question/(\d+)/answer/(\d+)', answer_url)[0]
        html_content = self.request(answer_url)
        if html_content:
            all_content['main_content'] = html_content
        else:
            raise ValueError('request failed, quit......')
        ajax_answer_url = 'https://www.zhihu.com/api/v4/answers/{}'.format(answer_id)
        ajax_content = self.request(ajax_answer_url)
        if ajax_content:
            all_content['ajax_content'] = json.loads(ajax_content)
        else:
            raise ValueError('request failed, quit......')
        self.parse_content(all_content, flag)

    def parse_content(self, content, flag=None):
        data = parse(content, flag)
        self.transform_to_markdown(data)

    def transform_to_markdown(self, data):
        content = data['content']
        author_name = data['author_name']
        answer_id = data['answer_id']
        question_id = data['question_id']
        question_title = data['question_title']
        vote_up_count = data['vote_up_count']
        create_time = data['create_time']
        file_name = u'%s--%s的回答[%d].md' % (question_title, author_name, answer_id)
        folder_name = u'%s' % question_title
        if not os.path.exists(os.path.join(os.getcwd(), folder_name)):
            os.mkdir(folder_name)
        os.chdir(folder_name)
        f = open(file_name, "w")
        f.write("-" * 40 + "\n")
        origin_url = 'https://www.zhihu.com/question/{}/answer/{}'.format(question_id, answer_id)
        f.write("## Original answer URL: " + origin_url + "\n")
        f.write("### Question_Title: " + question_title + "\n")
        f.write("### Author_Name: " + author_name + "\n")
        f.write("### Answer_ID: %d" % answer_id + "\n")
        f.write("### Question_ID: %d" % question_id + "\n")
        f.write("### VoteCount: %s" % vote_up_count + "\n")
        f.write("### Create_Time: " + create_time + "\n")
        f.write("-" * 40 + "\n")
        text = html2text.html2text(content.decode('utf-8'))
        # Strip extra whitespace inside bold/italic markers
        r = re.findall(r'\*\*(.*?)\*\*', text, re.S)
        for i in r:
            if i != " ":
                text = text.replace(i, i.strip())
        r = re.findall(r'_(.*)_', text)
        for i in r:
            if i != " ":
                text = text.replace(i, i.strip())
        text = text.replace('_ _', '')
        # Put images on their own paragraph
        r = re.findall(r'!\[\]\((?:.*?)\)', text)
        for i in r:
            text = text.replace(i, i + "\n\n")
        f.write(text)
        f.close()


if __name__ == '__main__':
    zhihu = ZhiHu()
    url = 'https://www.zhihu.com/question/27069622/answer/214576023'
    zhihu.get_single_answer_content(url)

    # question_id = '27621722'
    # zhihu.get_all_answer_content(question_id)

parse_content.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by shimeng on 17-6-5
import time
from bs4 import BeautifulSoup


def html_template(data):
    # wrap the API content in a minimal HTML document
    html = '''
            <html>
            <head>
            <body>
            %s
            </body>
            </head>
            </html>
            ''' % data
    return html


def parse(content, flag=None):
    data = {}
    if flag == 1:
        # single answer
        main_content = content.get('main_content')
        ajax_content = content.get('ajax_content')
        soup = BeautifulSoup(main_content.decode("utf-8"), "lxml")
        answer = soup.find("span", class_="RichText CopyrightRichText-richText")
        author_name = ajax_content.get('author').get('name')
        answer_id = ajax_content.get('id')
        question_id = ajax_content.get('question').get('id')
        question_title = ajax_content.get('question').get('title')
        vote_up_count = soup.find("meta", itemprop="upvoteCount")["content"]
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ajax_content.get('created_time')))
    else:
        # all answers of a question
        answer_content = content.get('content')
        author_name = content.get('author').get('name')
        answer_id = content.get('id')
        question_id = content.get('question').get('id')
        question_title = content.get('question').get('title')
        vote_up_count = content.get('voteup_count')
        create_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(content.get('created_time')))
        content = html_template(answer_content)
        soup = BeautifulSoup(content, 'lxml')
        answer = soup.find("body")

    print(author_name, answer_id, question_id, question_title, vote_up_count, create_time)
    # The DOM rewriting below is adapted from someone else's code, as the original author notes.
    soup.body.extract()
    soup.head.insert_after(soup.new_tag("body", **{'class': 'zhi'}))
    soup.body.append(answer)
    img_list = soup.find_all("img", class_="content_image lazy")
    for img in img_list:
        img["src"] = img["data-actualsrc"]
    img_list = soup.find_all("img", class_="origin_image zh-lightbox-thumb lazy")
    for img in img_list:
        img["src"] = img["data-actualsrc"]
    noscript_list = soup.find_all("noscript")
    for noscript in noscript_list:
        noscript.extract()
    data['content'] = soup
    data['author_name'] = author_name
    data['answer_id'] = answer_id
    data['question_id'] = question_id
    data['question_title'] = question_title
    data['vote_up_count'] = vote_up_count
    data['create_time'] = create_time
    return data

taobaocomment

Using Python 3.6. Reposted from http://www.cnblogs.com/dearvee/p/6565688.html

import requests
import json


def getCommodityComments(url):
    # Extract the numeric item id from the product URL (handles 11- and 12-digit ids).
    if url[url.find('id=') + 14] != '&':
        id = url[url.find('id=') + 3:url.find('id=') + 15]
    else:
        id = url[url.find('id=') + 3:url.find('id=') + 14]
    url = 'https://rate.taobao.com/feedRateList.htm?auctionNumId=' + id + '&currentPageNum=1'
    res = requests.get(url)
    jc = json.loads(res.text.strip().strip('()'))
    max = jc['total']
    users = []
    comments = []
    count = 0
    page = 1
    print('This item has ' + str(max) + ' comments in total: loading...')
    while count < max:
        res = requests.get(url[:-1] + str(page))
        page = page + 1
        jc = json.loads(res.text.strip().strip('()'))
        jc = jc['comments']
        for j in jc:
            users.append(j['user']['nick'])
            comments.append(j['content'])
            print(count + 1, '>>', users[count], '\n  ', comments[count])
            count = count + 1


getCommodityComments('https://item.taobao.com/item.htm?id=39595400262&')
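The function above only prints each comment. As a small variant built on the same feedRateList.htm endpoint, the sketch below collects the (user, comment) pairs and writes them to a JSON file; the function name, page limit, and output file name are assumptions.

# Hypothetical variant of the crawler above: collect comments and save them as JSON.
import json
import requests


def fetch_comments(item_id, max_pages=3):
    base = 'https://rate.taobao.com/feedRateList.htm?auctionNumId=%s&currentPageNum=%d'
    collected = []
    for page in range(1, max_pages + 1):
        res = requests.get(base % (item_id, page))
        data = json.loads(res.text.strip().strip('()'))
        for c in data.get('comments') or []:
            collected.append({'user': c['user']['nick'], 'content': c['content']})
    return collected


if __name__ == '__main__':
    comments = fetch_comments('39595400262')
    with open('taobao_comments.json', 'w', encoding='utf-8') as f:
        json.dump(comments, f, ensure_ascii=False, indent=2)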