Python web crawling using Scrapy (part 1)

Web-crawling examples in Python using the Scrapy framework.

QuotesBot using scrapy

spider.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
# toscrape-css.py
# -*- coding: utf-8 -*-
import scrapy


class ToScrapeCSSSpider(scrapy.Spider):
    """Scrape quotes from quotes.toscrape.com using CSS selectors."""

    name = "toscrape-css"
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse(self, response):
        # Each quote sits in its own <div class="quote"> container.
        for quote in response.css("div.quote"):
            item = {
                'text': quote.css("span.text::text").extract_first(),
                'author': quote.css("small.author::text").extract_first(),
                'tags': quote.css("div.tags > a.tag::text").extract(),
            }
            yield item
        # Keep following the "next" pagination link while one exists.
        next_href = response.css("li.next > a::attr(href)").extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href))
#toscrape-xpath.py
# -*- coding: utf-8 -*-
import scrapy


class ToScrapeSpiderXPath(scrapy.Spider):
    """Scrape quotes from quotes.toscrape.com using XPath selectors."""

    name = 'toscrape-xpath'
    start_urls = [
        'http://quotes.toscrape.com/',
    ]

    def parse(self, response):
        # Each quote sits in its own <div class="quote"> container.
        for quote in response.xpath('//div[@class="quote"]'):
            item = {
                'text': quote.xpath('./span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('.//small[@class="author"]/text()').extract_first(),
                'tags': quote.xpath('.//div[@class="tags"]/a[@class="tag"]/text()').extract(),
            }
            yield item
        # Keep following the "next" pagination link while one exists.
        next_href = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href))

Running the spiders

1
$ scrapy crawl toscrape-css -o quotes.json

Reposted example: crawling a site

w3school.com.cn

spider.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import scrapy


class W3schoolSpider(scrapy.Spider):
    """Scrape the tutorial navigation entries from w3school.com.cn."""

    name = "w3school"
    allowed_domains = ["w3school.com.cn"]
    start_urls = [
        "http://www.w3school.com.cn/xml/xml_syntax.asp"
    ]

    def parse(self, response):
        # Each <li> of the first course list is one tutorial entry;
        # its <a> carries the title (text), link (href) and desc (title).
        entry_selector = '//div[@id="navsecond"]/div[@id="course"]/ul[1]/li'
        for site in response.xpath(entry_selector):
            yield {
                'title': site.xpath('a/text()').extract_first(),
                'link': site.xpath('a/@href').extract_first(),
                'desc': site.xpath('a/@title').extract(),
            }

settings.py

1
FEED_EXPORT_ENCODING = 'utf-8' # write exported feeds as UTF-8 text instead of \uXXXX-escaped ASCII

csdnblog

spider.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
import scrapy


class CSDNBlogSpider(scrapy.Spider):
    """Crawl CSDN blog articles, following each "next article" link."""

    name = "CSDNBlog"
    # Throttle the crawl to roughly one request per second.
    download_delay = 1
    start_urls = [
        # URL of the first article in the series.
        "http://blog.csdn.net/u012150179/article/details/11749017"
    ]

    def parse(self, response):
        # The article title lives in a <span> inside the details header.
        for site in response.xpath('//div[@id="article_details"]/div[1]/h1/span'):
            yield {
                'article_url': str(response.url),
                'article_name': site.xpath('a/text()').extract_first(),
            }
        # Queue the next article, if the page links to one.
        next_href = response.xpath('//li[@class="next_article"]/a/@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(response.urljoin(next_href))

settings.py

1
FEED_EXPORT_ENCODING = 'utf-8' # write exported feeds as UTF-8 text instead of \uXXXX-escaped ASCII

stock163

转载自 http://blog.csdn.net/c406495762/article/details/77801899 ,建表cwzb,lrb,fzb,llb,字段添加股票名和股票代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#-*- coding:UTF-8 -*-
import pymysql
import requests
import json
import re
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Open the database connection: host, port, user, password,
    # database name and character encoding.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           passwd='yourpasswd', db='financialdata',
                           charset='utf8')
    # Cursor used for all SQL statements below.
    cursor = conn.cursor()
    # Key financial indicators (table cwzb): API field -> Chinese column name.
    cwzb_dict = {'EPS':'基本每股收益','EPS_DILUTED':'摊薄每股收益','GROSS_MARGIN':'毛利率',
                 'CAPITAL_ADEQUACY':'资本充足率','LOANS_DEPOSITS':'贷款回报率','ROTA':'总资产收益率',
                 'ROEQUITY':'净资产收益率','CURRENT_RATIO':'流动比率','QUICK_RATIO':'速动比率',
                 'ROLOANS':'存贷比','INVENTORY_TURNOVER':'存货周转率','GENERAL_ADMIN_RATIO':'管理费用比率',
                 'TOTAL_ASSET2TURNOVER':'资产周转率','FINCOSTS_GROSSPROFIT':'财务费用比率',
                 'TURNOVER_CASH':'销售现金比率','YEAREND_DATE':'报表日期'}
    # Income statement (table lrb).
    lrb_dict = {'TURNOVER':'总营收','OPER_PROFIT':'经营利润','PBT':'除税前利润',
                'NET_PROF':'净利润','EPS':'每股基本盈利','DPS':'每股派息',
                'INCOME_INTEREST':'利息收益','INCOME_NETTRADING':'交易收益',
                'INCOME_NETFEE':'费用收益','YEAREND_DATE':'报表日期'}
    # Balance sheet (table fzb).
    fzb_dict = {
        'FIX_ASS':'固定资产','CURR_ASS':'流动资产','CURR_LIAB':'流动负债',
        'INVENTORY':'存款','CASH':'现金及银行存结','OTHER_ASS':'其他资产',
        'TOTAL_ASS':'总资产','TOTAL_LIAB':'总负债','EQUITY':'股东权益',
        'CASH_SHORTTERMFUND':'库存现金及短期资金','DEPOSITS_FROM_CUSTOMER':'客户存款',
        'FINANCIALASSET_SALE':'可供出售之证券','LOAN_TO_BANK':'银行同业存款及贷款',
        'DERIVATIVES_LIABILITIES':'金融负债','DERIVATIVES_ASSET':'金融资产','YEAREND_DATE':'报表日期'}
    # Cash-flow statement (table llb).
    # NOTE(review): 'CF_INT_REC' is listed twice below; the second entry
    # ('已收股息', dividends received) silently overwrites the first
    # ('已收利息', interest received). The dividends field probably has a
    # different API name (e.g. 'CF_DIV_REC') -- TODO confirm against the
    # service response and correct the key.
    llb_dict = {
        'CF_NCF_OPERACT':'经营活动产生的现金流','CF_INT_REC':'已收利息','CF_INT_PAID':'已付利息',
        'CF_INT_REC':'已收股息','CF_DIV_PAID':'已派股息','CF_INV':'投资活动产生现金流',
        'CF_FIN_ACT':'融资活动产生现金流','CF_BEG':'期初现金及现金等价物','CF_CHANGE_CSH':'现金及现金等价物净增加额',
        'CF_END':'期末现金及现金等价物','CF_EXCH':'汇率变动影响','YEAREND_DATE':'报表日期'}
    # All field mappings keyed by their MySQL table name.
    table_dict = {'cwzb':cwzb_dict,'lrb':lrb_dict,'fzb':fzb_dict,'llb':llb_dict}
    # Request headers that mimic a desktop browser.
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'zh-CN,zh;q=0.8',
               'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36',}
    # Financial-data page of the listed stock to crawl (Tencent, 00700).
    target_url = 'http://quotes.money.163.com/hkstock/cwsj_00700.html'
    req = requests.get(url=target_url, headers=headers)
    req.encoding = 'utf-8'
    html = req.text
    page_bf = BeautifulSoup(html, 'lxml')
    # Stock name and numeric stock code.
    name = page_bf.find_all('span', class_='name')[0].string
    code = page_bf.find_all('span', class_='code')[0].string
    # Raw string so '\d' is a regex escape, not a (deprecated) str escape.
    code = re.findall(r'\d+', code)[0]
    # Print the stock information.
    print(name + ':' + code)
    print('')
    # Table titles, per-table report dates, and per-table type keys.
    table_name_list = []
    table_date_list = []
    each_date_list = []
    url_list = []
    table_name = page_bf.find_all('div', class_='titlebar3')
    for each_table_name in table_name:
        # Table title.
        table_name_list.append(each_table_name.span.string)
        # Report dates offered by the first <select> of each table; the
        # select's id is '<type>1' where <type> doubles as the table name.
        for each_table_date in each_table_name.div.find_all('select', id=re.compile(r'.+1$')):
            url_list.append(re.findall(r'(\w+)1', each_table_date.get('id'))[0])
            for each_date in each_table_date.find_all('option'):
                each_date_list.append(each_date.string)
            table_date_list.append(each_date_list)
            each_date_list = []
    # Fetch each table's JSON data and insert it into MySQL.
    for i in range(len(table_name_list)):
        print('表名:', table_name_list[i])
        print('')
        # url_list[i] was scraped from the page: only use it as a SQL
        # identifier after validating it against the known table names.
        if url_list[i] not in table_dict:
            continue
        # Data-service URL for this table, earliest to latest report date.
        url = 'http://quotes.money.163.com/hk/service/cwsj_service.php?symbol={}&start={}&end={}&type={}&unit=yuan'.format(
            code, table_date_list[i][-1], table_date_list[i][0], url_list[i])
        req_table = requests.get(url=url, headers=headers)
        for each_data in req_table.json():
            value_dict = {'股票名': name, '股票代码': code}
            # Translate API field names to Chinese column names.
            for key, value in each_data.items():
                if key in table_dict[url_list[i]]:
                    value_dict[table_dict[url_list[i]][key]] = value
            # Insert the row key (name, code, report date). Values are
            # bound as query parameters rather than interpolated into the
            # SQL string, avoiding quoting bugs and SQL injection.
            sql1 = "INSERT INTO %s (`股票名`,`股票代码`,`报表日期`) VALUES (%%s,%%s,%%s)" % url_list[i]
            print(sql1)
            try:
                cursor.execute(sql1, (value_dict['股票名'], value_dict['股票代码'], value_dict['报表日期']))
                conn.commit()
            except Exception:
                # Roll back this row on any database error.
                conn.rollback()
            # Fill in the remaining columns one UPDATE at a time.
            for key, value in value_dict.items():
                if key not in ['股票名', '股票代码', '报表日期']:
                    # Column names come from the hard-coded *_dict mappings
                    # above, so interpolating them is safe; the cell value
                    # is still passed as a bound parameter.
                    sql2 = "UPDATE %s SET %s=%%s WHERE `股票名`=%%s AND `报表日期`=%%s" % (url_list[i], key)
                    print(sql2)
                    try:
                        cursor.execute(sql2, (value, value_dict['股票名'], value_dict['报表日期']))
                        conn.commit()
                    except Exception:
                        conn.rollback()
    # Close the cursor and the connection.
    cursor.close()
    conn.close()

comic

转载自http://blog.csdn.net/c406495762/article/details/72858983

comic.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy import Selector
from cartoon.items import ComicItem


class ComicSpider(scrapy.Spider):
    """Crawl comic chapters from comic.kukudm.com and yield ComicItem
    objects carrying the chapter name, page URL and image URL(s)."""

    name = 'comic'

    def __init__(self):
        # Initialise the base Spider first (the original skipped this,
        # which bypasses scrapy.Spider's own setup).
        super(ComicSpider, self).__init__()
        # Server hosting the image files.
        self.server_img = 'http://n.1whour.com/'
        # Server hosting the chapter pages.
        self.server_link = 'http://comic.kukudm.com'
        self.allowed_domains = ['comic.kukudm.com']
        self.start_urls = ['http://comic.kukudm.com/comiclist/3/']
        # Regex extracting the image path from the page's inline <script>.
        self.pattern_img = re.compile(r'\+"(.+)\'><span')

    def start_requests(self):
        # Start from the chapter-list page.
        yield scrapy.Request(url=self.start_urls[0], callback=self.parse1)

    def parse1(self, response):
        """Parse the chapter list: collect each chapter's link and name."""
        hxs = Selector(response)
        # Chapter page links and chapter titles, in document order.
        urls = hxs.xpath('//dd/a[1]/@href').extract()
        dir_names = hxs.xpath('//dd/a[1]/text()').extract()
        items = []
        for index in range(len(urls)):
            item = ComicItem()
            item['link_url'] = self.server_link + urls[index]
            item['dir_name'] = dir_names[index]
            items.append(item)
        # Request every chapter page, carrying the item along in meta.
        for item in items:
            yield scrapy.Request(url=item['link_url'], meta={'item': item},
                                 callback=self.parse2)

    def parse2(self, response):
        """Parse a chapter's first page: first image URL and page count."""
        item = response.meta['item']
        item['link_url'] = response.url
        hxs = Selector(response)
        pre_img_url = hxs.xpath('//script/text()').extract()
        # The pipeline expects a list of image URLs, not a bare string.
        img_url = [self.server_img + re.findall(self.pattern_img, pre_img_url[0])[0]]
        item['img_url'] = img_url
        # Hand the first page to the item pipeline for download.
        yield item
        # Total number of pages in this chapter (from text like "共25页").
        page_num = hxs.xpath('//td[@valign="top"]/text()').re(u'共(\d+)页')[0]
        # NOTE(review): the same item object is shared by every page
        # request below and mutated in parse3; this relies on each
        # callback yielding before the next runs. Consider passing
        # item.copy() per request to rule out cross-talk -- TODO confirm.
        pre_link = item['link_url'][:-5]
        for each_link in range(2, int(page_num) + 1):
            new_link = pre_link + str(each_link) + '.htm'
            yield scrapy.Request(url=new_link, meta={'item': item},
                                 callback=self.parse3)

    def parse3(self, response):
        """Parse a non-first chapter page and extract its image URL."""
        item = response.meta['item']
        item['link_url'] = response.url
        hxs = Selector(response)
        pre_img_url = hxs.xpath('//script/text()').extract()
        # Again, the pipeline expects a list.
        img_url = [self.server_img + re.findall(self.pattern_img, pre_img_url[0])[0]]
        item['img_url'] = img_url
        # Hand this page to the item pipeline for download.
        yield item

pipelines.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from cartoon import settings
from scrapy import Request
import requests
import os
class ComicImgDownloadPipeline(object):
    """Download each item's images into a per-chapter directory under
    settings.IMAGES_STORE and record the saved paths on the item."""

    def process_item(self, item, spider):
        # Only act when the spider attached image links to the item.
        if 'img_url' in item:
            images = []
            # One sub-directory per chapter name.
            dir_path = '%s/%s' % (settings.IMAGES_STORE, item['dir_name'])
            # exist_ok avoids the check-then-create race of the original
            # "if not exists: makedirs" pattern.
            os.makedirs(dir_path, exist_ok=True)
            for image_url in item['img_url']:
                # Build the file name "第<page>页.<ext>" from the URLs:
                # extension from the image URL, page number from the page URL.
                houzhui = image_url.split('/')[-1].split('.')[-1]
                qianzhui = item['link_url'].split('/')[-1].split('.')[0]
                image_file_name = '第' + qianzhui + '页.' + houzhui
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                # Skip images that were already downloaded.
                if os.path.exists(file_path):
                    continue
                # Fetch BEFORE opening the file so a failed request does
                # not leave an empty file behind; stream=True makes
                # iter_content actually stream instead of buffering the
                # whole body in memory first.
                response = requests.get(url=image_url, stream=True)
                with open(file_path, 'wb') as handle:
                    for block in response.iter_content(1024):
                        if not block:
                            break
                        handle.write(block)
            # Report where the images were saved.
            item['image_paths'] = images
        return item

settings.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Scrapy project settings for the "cartoon" comic crawler.
BOT_NAME = 'cartoon'
SPIDER_MODULES = ['cartoon.spiders']
NEWSPIDER_MODULE = 'cartoon.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'cartoon (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Route scraped items through the image-download pipeline (priority 1).
ITEM_PIPELINES = {
'cartoon.pipelines.ComicImgDownloadPipeline': 1,
}
# Root directory where per-chapter image folders are created.
IMAGES_STORE = 'D:/火影忍者'
# Cookies are not needed for this crawl.
COOKIES_ENABLED = False
DOWNLOAD_DELAY = 0.25 # 250 ms of delay

items.py

1
2
3
4
5
6
7
import scrapy


class ComicItem(scrapy.Item):
    """One comic page: chapter name, page URL, image URL(s), and the
    local paths the pipeline saved the images to."""

    dir_name = scrapy.Field()      # chapter (directory) name
    link_url = scrapy.Field()      # URL of the chapter page
    img_url = scrapy.Field()       # list of image URLs on that page
    image_paths = scrapy.Field()   # local file paths after download