~~~
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
from kunnanyuan.spider.spider.common import deal_date, transfrom, get_id
from ..items import XkItem
import json


class XkSdl10822Spider(scrapy.Spider):
    name = 'XK-FJM-0102'
    url = 'http://222.76.243.118:8090/publicity/get_double_publicity_record_list'
    # Build the request headers. Capture them with Postman, the browser dev tools or ApiPost
    # (or see the post 《爬虫骚操作之30秒写爬虫(实用)》), convert them to Python format and
    # paste them in; it only takes a few seconds.
    headers = {
        'Origin': 'http://222.76.243.118:8090',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36',
        # This header is required, otherwise the JSON parsing below will not work
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Referer': 'http://222.76.243.118:8090/page/double_publicity/allow.html',
        'Connection': 'keep-alive',
    }
    # Build the initial request in start_requests
    def start_requests(self):
        # No pagination loop here: everything is fetched in a single request
        # (linesPerPage covers all records), mainly with later incremental
        # crawling in mind.
        data = {
            'listSql': '',
            'linesPerPage': "6704",
            'currentPage': "1",
            'deptId': '',
            'searchKeyword': '',
            'tag': 'ALLOW'
        }
        yield scrapy.Request(url=self.url, body=json.dumps(data), method='POST',
                             headers=self.headers, callback=self.parse_list)

    # Split the data into pages of a fixed size
    # def parse_page(self, response):
    #     self.parse_list(response)
    #     if self.page == 1:
    #         ...  # rest omitted
    def parse_list(self, response):
        # The response body is JSON; convert it into a Python dict
        tr1 = json.loads(response.text)
        # Treat tr1 as one big dict and read values by key
        if tr1.get("message") == "请求成功":
            data = tr1.get('data')  # equivalent to data = tr1['data'], same below
            records = data.get('list')
            # Iterate over every record in the JSON list
            for i in records:
                if i['legalPersonDocNumber'] is not None:
                    identifier = i['legalPersonDocNumber']
                else:
                    identifier = i['naturalPersonDocNumber']
                if i['jgFr'] is not None:
                    organization = i['jgFr']
                else:
                    organization = i['jgZr']
                businessId = i['businessId']
                id = i['id']
                objectType = i['objectType']
                createdAt = deal_date(i['businessCreateDate'].split('000000')[0])
                source_url = ("http://222.76.243.118:8090/page/double_publicity/publicity_detail.html"
                              "?id={}&businessId={}&tag=ALLOW&objectType={}".format(str(id), str(businessId), str(objectType)))
                prPrincipal = i['objectName']
                data = {
                    "businessId": businessId,
                    "id": id,
                    'objectType': objectType,
                    'tag': "ALLOW",
                    'pictureMinHeight': '628',
                    'pictureMinWidth': '1200'
                }
                url = "http://222.76.243.118:8090/publicity/get_publicity_detail_picture"
                yield Request(url, callback=self.parse4, body=json.dumps(data), method='POST', headers=self.headers,
                              meta={"identifier": identifier, "organization": organization, "businessId": businessId,
                                    "createdAt": createdAt, "source_url": source_url, "prPrincipal": prPrincipal})
    # Parse the picture response and yield the item
    def parse4(self, response):
        item = XkItem()
        item['identifier'] = response.meta["identifier"]
        item['organization'] = response.meta["organization"]
        print(item['organization'])
        # item['businessId'] = response.meta["businessId"]
        item['createdAt'] = response.meta["createdAt"]
        item['source_url'] = response.meta['source_url']
        item['prPrincipal'] = response.meta['prPrincipal']
        item['type'] = transfrom(str(item['organization']))
        item['fileType'] = "jpg"
        item['pid'] = get_id(str(item['identifier']))
        item['idMethod'] = '2'
        tr2 = json.loads(response.text)
        if tr2.get("message") == "请求成功":
            data = tr2.get('data')
            path = data.get('path')
            item['images'] = "http://222.76.243.118:8090/" + path
        yield item
~~~
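The spider imports `XkItem` from the project's `items.py`, which is not shown here. A minimal sketch of what it might contain, assuming it only declares the fields actually assigned in `parse_list`/`parse4` (the real project may define more):

~~~
# Hypothetical items.py sketch: only the fields assigned in parse4 above are
# declared; the real XkItem in the project may contain additional fields.
import scrapy


class XkItem(scrapy.Item):
    identifier = scrapy.Field()    # legalPersonDocNumber / naturalPersonDocNumber
    organization = scrapy.Field()  # jgFr / jgZr
    createdAt = scrapy.Field()     # normalised businessCreateDate
    source_url = scrapy.Field()    # detail-page URL built in parse_list
    prPrincipal = scrapy.Field()   # objectName from the list API
    type = scrapy.Field()
    fileType = scrapy.Field()      # always "jpg" here
    pid = scrapy.Field()
    idMethod = scrapy.Field()
    images = scrapy.Field()        # absolute URL of the publicity picture
~~~

On Scrapy 1.8+ the manual `json.dumps(...)` plus `Content-Type` header can also be replaced by `scrapy.http.JsonRequest`, which serialises the `data` dict and sets the JSON headers for you.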
Or, a minimal version:
~~~
# coding=utf-8
import scrapy
import json


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    start_urls = [
        "http://www.test.com/test/get_data"
    ]

    def parse(self, response):
        # response.text decodes the body, so unicode-encoded JSON is handled
        # correctly (it replaces the older body_as_unicode() call)
        sites = json.loads(response.text)
        # print(sites['k'])
        numbers = sites['k'].split(',')
        print(numbers)
~~~
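Either spider is run with `scrapy crawl <name>` as usual. To check the JSON handling without hitting the real URL, the `parse` callback can be fed a hand-built response. A minimal sketch, assuming the `DmozSpider` class above is importable and using a made-up sample payload:

~~~
# Minimal offline test sketch: the payload {'k': '1,2,3'} is an assumed
# example, not the real API response.
import json
from scrapy.http import TextResponse

fake_response = TextResponse(
    url="http://www.test.com/test/get_data",
    body=json.dumps({'k': '1,2,3'}).encode('utf-8'),
    encoding='utf-8',
)

spider = DmozSpider()
spider.parse(fake_response)   # prints ['1', '2', '3']
~~~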