![](https://img.kancloud.cn/41/e0/41e066af9a6c25a24868d9667253ec98_1241x333.jpg)
*****
## Amazon spider
- Requirement: scrape book information from Amazon
- Targets: the top-level book categories, book URLs, list-page pagination URLs, book titles, authors, and prices
- Start URL: [https://www.amazon.cn/图书/b/ref=sd\_allcat\_books\_l1?ie=UTF8&node=658390051](https://www.amazon.cn/%E5%9B%BE%E4%B9%A6/b/ref=sd_allcat_books_l1?ie=UTF8&node=658390051)
```
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re
from scrapy_redis.spiders import RedisCrawlSpider
"""
目标:抓取亚马逊图书信息, 有图书的名字、封面图片地址、图书url地址、作者、出版社、出版时间、价格、图书所属大分类、图书所属小的分类、分类的url地址
思路:
1. 先完成scrapy的CrawlSpider
2. 改为RedisCrawlSpider
2.1 修改继承关系, 继承RedisCrawlSpider
2.2 start_urls 改为 redis_key
2.3 修改配置文件(多个爬虫配置一次就可以了)
"""
# 2.1 Change the base class to RedisCrawlSpider
class AmazonSpider(RedisCrawlSpider):
    name = 'amazon'
    allowed_domains = ['amazon.cn']
    # The original start URL, kept for reference:
    # start_urls = ['https://www.amazon.cn/图书/b/ref=sa_menu_top_books_l1?ie=UTF8&node=658390051']
    # 2.2 Replace start_urls with redis_key:
    # the Redis key from which the spider pops its start URLs
    redis_key = 'amazon:start_urls'

    rules = (
        # 1. Extract the category URLs.
        # restrict_xpaths limits link extraction to the given page region.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="leftNav"]/ul[1]/ul/div/li'), follow=True),
        # 2. Extract the list-page pagination URLs.
        Rule(LinkExtractor(restrict_xpaths='//*[@id="pagn"]'), follow=True),
        # 3. Extract the detail-page URLs and parse them with parse_item.
        Rule(LinkExtractor(restrict_xpaths='//a[contains(@class, "s-access-detail-page")]'), callback='parse_item'),
    )
    def parse_item(self, response):
        # Parse the detail-page data.
        item = {}
        # Book title: matching on "roductTitle" covers ids such as
        # "productTitle" and "ebooksProductTitle".
        item['book_name'] = response.xpath('//*[contains(@id, "roductTitle")]/text()').extract_first()
        # Cover image URL: "mgBlkFront" likewise covers "imgBlkFront"
        # and "ebooksImgBlkFront".
        item['book_img'] = response.xpath('//*[contains(@id, "mgBlkFront")]/@src').extract_first()
        # Book URL
        item['book_url'] = response.url
        # Author(s)
        item['book_author'] = ''.join(response.xpath('//*[@id="bylineInfo"]/span/a/text()').extract())
        # Price
        item['book_price'] = response.xpath('//span[contains(@class, "a-color-price")]/text()').extract_first()
        # Publisher and publication date,
        # e.g. publish == [('中信出版社', '2018年7月1日')]
        publish = re.findall(r'<li><b>出版社:</b> (.+?);.*?\((.+?)\)</li>', response.text)
        if len(publish) != 0:
            # Publisher
            item['book_publisher'] = publish[0][0]
            # Publication date
            item['book_publish_date'] = publish[0][1]
        # Category breadcrumb: the <a> tags that hold the category names
        a_s = response.xpath('//span[@class="a-list-item"]/a[text()]')
        # Top-level (big) category
        if len(a_s) > 0:
            item['b_category_name'] = a_s[0].xpath('./text()').extract_first().strip()
            item['b_category_url'] = response.urljoin(a_s[0].xpath('./@href').extract_first())
        # Middle category
        if len(a_s) > 1:
            item['m_category_name'] = a_s[1].xpath('./text()').extract_first().strip()
            item['m_category_url'] = response.urljoin(a_s[1].xpath('./@href').extract_first())
        # Small (leaf) category
        if len(a_s) > 2:
            item['s_category_name'] = a_s[2].xpath('./text()').extract_first().strip()
            item['s_category_url'] = response.urljoin(a_s[2].xpath('./@href').extract_first())
        # Hand the item to the engine
        yield item
```
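
Step 2.3 above refers to the project's `settings.py`. A minimal sketch of the scrapy-redis configuration, assuming a local Redis on the default port (the `REDIS_URL` value is an assumption):

```
# settings.py (sketch) -- shared by every spider in the project
# Let scrapy-redis schedule requests and deduplicate fingerprints in Redis,
# so multiple spider processes can share one request queue.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Keep the queue and fingerprint set in Redis between runs (resumable crawl).
SCHEDULER_PERSIST = True
# Optionally also store scraped items in Redis.
ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 400,
}
# Assumption: Redis runs locally on the default port.
REDIS_URL = 'redis://127.0.0.1:6379'
```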
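
Because `start_urls` was replaced by `redis_key`, a freshly started spider idles until a URL appears under `amazon:start_urls`. A sketch of seeding that key with the redis-py client (host and port are assumptions matching `REDIS_URL` above):

```
import redis

# Assumption: same local Redis instance as in the settings sketch.
r = redis.Redis(host='127.0.0.1', port=6379)
r.lpush('amazon:start_urls',
        'https://www.amazon.cn/图书/b/ref=sd_allcat_books_l1?ie=UTF8&node=658390051')
```

The same spider can then be launched on several machines with `scrapy crawl amazon`; all instances consume from the one shared Redis queue.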