~~~
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from kunnanyuan.spider.spider.common import deal_date, transfrom, get_id
from ..items import XkItem
import json


class XkSdl10822Spider(scrapy.Spider):
    name = 'XK-FJM-0102'
    url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'
    # Build the request headers. Capture them with Postman, the browser dev tools or ApiPost
    # (or see the post 《爬虫骚操作之30秒写爬虫(实用)》), convert them to Python format and
    # paste them in; it only takes a few seconds.
    headers = {
        'Origin': 'http://222.76.243.118:8090',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This header is required, otherwise the JSON parsing below will not work
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
        'Connection': 'keep-alive',
    }
    # Build the initial request in start_requests
    def start_requests(self):
        # No pagination loop here: everything is fetched in a single request
        # (linesPerPage covers all records), mainly with later incremental
        # crawling in mind.
        data = {
            'listSql': '',
            'linesPerPage': "6704",
            'currentPage': "1",
            'deptId': '',
            'searchKeyword': '',
            'tag': 'ALLOW'
        }
        yield scrapy.Request(url=self.url, body=json.dumps(data), method='POST',
                             headers=self.headers, callback=self.parse_list)

    # Split the data into pages of a fixed size
    # def parse_page(self, response):
    #     self.parse_list(response)
    #     if self.page == 1:
    #         ...  # rest omitted
    def parse_list(self, response):
        # The response body is JSON; convert it into a Python dict
        tr1 = json.loads(response.text)
        # Treat tr1 as one big dict and read values by key
        if tr1.get("message") == "请求成功":
            data = tr1.get('data')  # equivalent to data = tr1['data'], same below
            records = data.get('list')
            # Iterate over every record in the JSON list
            for i in records:
                if i['legalPersonDocNumber'] is not None:
                    identifier = i['legalPersonDocNumber']
                else:
                    identifier = i['naturalPersonDocNumber']
                if i['jgFr'] is not None:
                    organization = i['jgFr']
                else:
                    organization = i['jgZr']
                businessId = i['businessId']
                id = i['id']
                objectType = i['objectType']
                createdAt = deal_date(i['businessCreateDate'].split('000000')[0])
                source_url = ("http://222.76.243.118:8090/page/double_publicity/publicity_detail.html"
                              "?id={}&businessId={}&tag=ALLOW&objectType={}".format(str(id), str(businessId), str(objectType)))
                prPrincipal = i['objectName']
                data = {
                    "businessId": businessId,
                    "id": id,
                    'objectType': objectType,
                    'tag': "ALLOW",
                    'pictureMinHeight': '628',
                    'pictureMinWidth': '1200'
                }
                url = "http://222.76.243.118:8090/publicity/get_publicity_detail_picture"
                yield Request(url, callback=self.parse4, body=json.dumps(data), method='POST', headers=self.headers,
                              meta={"identifier": identifier, "organization": organization, "businessId": businessId,
                                    "createdAt": createdAt, "source_url": source_url, "prPrincipal": prPrincipal})
    # Parse the picture response and yield the item
    def parse4(self, response):
        item = XkItem()
        item['identifier'] = response.meta["identifier"]
        item['organization'] = response.meta["organization"]
        print(item['organization'])
        # item['businessId'] = response.meta["businessId"]
        item['createdAt'] = response.meta["createdAt"]
        item['source_url'] = response.meta['source_url']
        item['prPrincipal'] = response.meta['prPrincipal']
        item['type'] = transfrom(str(item['organization']))
        item['fileType'] = "jpg"
        item['pid'] = get_id(str(item['identifier']))
        item['idMethod'] = '2'
        tr2 = json.loads(response.text)
        if tr2.get("message") == "请求成功":
            data = tr2.get('data')
            path = data.get('path')
            item['images'] = "http://222.76.243.118:8090/" + path
        yield item
~~~
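The spider imports `XkItem` from the project's `items.py`, which is not shown here. A minimal sketch of what it might contain, assuming it only declares the fields actually assigned in `parse_list`/`parse4` (the real project may define more):

~~~
# Hypothetical items.py sketch: only the fields assigned in parse4 above are
# declared; the real XkItem in the project may contain additional fields.
import scrapy


class XkItem(scrapy.Item):
    identifier = scrapy.Field()    # legalPersonDocNumber / naturalPersonDocNumber
    organization = scrapy.Field()  # jgFr / jgZr
    createdAt = scrapy.Field()     # normalised businessCreateDate
    source_url = scrapy.Field()    # detail-page URL built in parse_list
    prPrincipal = scrapy.Field()   # objectName from the list API
    type = scrapy.Field()
    fileType = scrapy.Field()      # always "jpg" here
    pid = scrapy.Field()
    idMethod = scrapy.Field()
    images = scrapy.Field()        # absolute URL of the publicity picture
~~~

On Scrapy 1.8+ the manual `json.dumps(...)` plus `Content-Type` header can also be replaced by `scrapy.http.JsonRequest`, which serialises the `data` dict and sets the JSON headers for you.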
Or, a minimal version:
~~~
# coding=utf-8
import scrapy
import json


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.test.com/test/get_data"
    ]

    def parse(self, response):
        # response.text decodes the body, so unicode-encoded JSON is handled
        # correctly (it replaces the older body_as_unicode() call)
        sites = json.loads(response.text)
        # print(sites['k'])
        numbers = sites['k'].split(',')
        print(numbers)
~~~
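Either spider is run with `scrapy crawl <name>` as usual. To check the JSON handling without hitting the real URL, the `parse` callback can be fed a hand-built response. A minimal sketch, assuming the `DmozSpider` class above is importable and using a made-up sample payload:

~~~
# Minimal offline test sketch: the payload {'k': '1,2,3'} is an assumed
# example, not the real API response.
import json
from scrapy.http import TextResponse

fake_response = TextResponse(
    url="http://www.test.com/test/get_data",
    body=json.dumps({'k': '1,2,3'}).encode('utf-8'),
    encoding='utf-8',
)

spider = DmozSpider()
spider.parse(fake_response)   # prints ['1', '2', '3']
~~~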