### **Step 1: Create the project**
~~~
scrapy startproject cententjob
~~~
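After the command runs, Scrapy generates its standard project skeleton; the files edited in the following steps all live inside it (depending on the Scrapy version, a `middlewares.py` may also be created):
~~~
cententjob/
    scrapy.cfg            # deploy/config file
    cententjob/
        __init__.py
        items.py          # item definitions (step 3)
        pipelines.py      # item pipelines (step 5)
        settings.py       # project settings (step 6)
        spiders/
            __init__.py   # spider modules go in this package (step 4)
~~~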
### **Step 2: Create the spider, giving it a name and the site to crawl**
~~~
scrapy genspider cententjob hr.tencent.com
~~~
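`genspider` drops a skeleton spider into the `spiders/` package. Roughly (the exact template depends on the Scrapy version), it looks like this before step 4 fills it in:
~~~
import scrapy


class CententjobSpider(scrapy.Spider):
    name = 'cententjob'
    allowed_domains = ['hr.tencent.com']
    start_urls = ['https://hr.tencent.com/position.php']

    def parse(self, response):
        pass
~~~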
### **Step 3: Write items.py to define the data to extract**
~~~
import scrapy


class CententjobItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()        # job title
    link = scrapy.Field()         # link to the job detail page
    category = scrapy.Field()     # job category
    number = scrapy.Field()       # number of openings
    address = scrapy.Field()      # work location
    create_time = scrapy.Field()  # publish date
~~~
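A `CententjobItem` behaves like a dict whose keys are restricted to the declared fields, which is exactly how the spider in step 4 fills it. A quick sketch:
~~~
from cententjob.items import CententjobItem

jobs = CententjobItem()
jobs['title'] = 'some job title'   # OK: 'title' is a declared field
# jobs['salary'] = '10k'           # would raise KeyError: field not declared
print(dict(jobs))
~~~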
### **Step 4: Write spiders/xxx.py — the spider that handles requests and responses and yields the extracted items**
~~~
import scrapy
from cententjob.items import CententjobItem


class CententjobSpider(scrapy.Spider):
    # Debug print: runs when the module is imported, so it also shows up in `scrapy list` output
    print(CententjobItem)
    name = 'cententjob'
    # Domains the spider is allowed to crawl (optional)
    # allowed_domains = ['hr.tencent.com']
    # Offset-based pagination pieces (used by the commented-out variant at the bottom)
    baseUrl = 'https://hr.tencent.com/position.php?start='
    offset = 0
    start_urls = [baseUrl + str(offset)]

    def parse(self, response):
        # Each job posting sits in a <tr> with class "even" or "odd"
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            jobs = CententjobItem()
            # Extract the fields of each posting (extract() returns Unicode strings)
            jobs['title'] = node.xpath("./td[1]/a/text()").extract()[0]
            jobs['link'] = node.xpath("./td[1]/a/@href").extract()[0]
            try:
                jobs['category'] = node.xpath("./td[2]/text()").extract()[0]
            except IndexError:
                # Some rows have no category cell
                jobs['category'] = ''
            jobs['number'] = node.xpath("./td[3]/text()").extract()[0]
            jobs['address'] = node.xpath("./td[4]/text()").extract()[0]
            jobs['create_time'] = node.xpath("./td[5]/text()").extract()[0]
            yield jobs

        # Follow the "next page" link until it is disabled (class "noactive")
        if not response.xpath("//a[@class='noactive' and @id='next']/@href"):
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            allUrl = 'https://hr.tencent.com/' + url
            yield scrapy.Request(allUrl, callback=self.parse)

        # Alternative: paginate by incrementing the start offset
        # if self.offset < 3110:
        #     self.offset += 10
        #     url = self.baseUrl + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)
~~~
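Before running the full spider, the XPath expressions above can be tested interactively with `scrapy shell`; the selectors below are the same ones used in `parse()` (assuming the page is still reachable):
~~~
scrapy shell https://hr.tencent.com/position.php
>>> nodes = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
>>> nodes[0].xpath("./td[1]/a/text()").extract()
>>> response.xpath("//a[@id='next']/@href").extract()
~~~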
### **Step 5: Write pipelines.py, the pipeline that processes the items returned by the spider**
~~~
import MySQLdb


class CententjobPipeline(object):
    def __init__(self):
        # Connect to the database
        self.con = MySQLdb.connect(
            host='127.0.0.1',
            port=3306,
            db='ganji',
            user='root',
            passwd='123456',
            charset='utf8',
            use_unicode=True,
        )
        # All queries and inserts go through this cursor
        self.cu = self.con.cursor()

    def open_spider(self, spider):
        print('pipeline opened, starting to store items')

    def process_item(self, item, spider):
        try:
            # Duplicate check: treat rows with the same link as already stored
            self.cu.execute("select 1 from centent where link = %s", (item['link'],))
            if self.cu.fetchone():
                print('duplicate row, skipping')
            else:
                insert_sql = (
                    "insert into centent "
                    "(title, link, category, number, address, create_time) "
                    "values (%s, %s, %s, %s, %s, %s)"
                )
                # Parameterized query avoids quoting problems and SQL injection
                self.cu.execute(insert_sql, (item['title'], item['link'],
                                             item['category'], item['number'],
                                             item['address'], item['create_time']))
                self.con.commit()
        except Exception as error:
            # Log the error through the spider's logger
            spider.logger.error(error)
        return item

    def close_spider(self, spider):
        self.con.close()
        print('pipeline closed')
~~~
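The pipeline assumes the `ganji` database already contains a `centent` table with the six columns used in the insert. A minimal sketch for creating it (the column types and lengths are assumptions, adjust as needed):
~~~
import MySQLdb

con = MySQLdb.connect(host='127.0.0.1', port=3306, db='ganji',
                      user='root', passwd='123456', charset='utf8')
cu = con.cursor()
# Columns match the CententjobItem fields; sizes are guesses.
cu.execute("""
    CREATE TABLE IF NOT EXISTS centent (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        link VARCHAR(255),
        category VARCHAR(64),
        number VARCHAR(16),
        address VARCHAR(64),
        create_time VARCHAR(32)
    ) DEFAULT CHARSET=utf8
""")
con.commit()
con.close()
~~~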
### **Step 6: Edit settings.py to enable the pipeline and set other related options**
~~~
ITEM_PIPELINES = {
    'cententjob.pipelines.CententjobPipeline': 300,
}
~~~
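Depending on the Scrapy version and how polite you want to be to the site, a few other options in the same file may be worth adjusting (these are standard Scrapy settings, not strictly required by this project):
~~~
# Newer project templates enable robots.txt checking by default
ROBOTSTXT_OBEY = False

# Wait a bit between requests to avoid hammering the server
DOWNLOAD_DELAY = 1

# Some sites reject the default Scrapy User-Agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
~~~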
### **Step 7: Run the spider**
~~~
C:\Users\Administrator\Desktop\cententjob>scrapy list
<class 'cententjob.items.CententjobItem'>
cententjob
C:\Users\Administrator\Desktop\cententjob>scrapy crawl cententjob
~~~
`scrapy list` confirms the spider is registered (the extra `<class ...>` line comes from the debug print left in the spider module), and `scrapy crawl cententjob` kicks off the crawl. Now just sit back and wait for the data to roll in.
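To sanity-check the extracted data without going through the MySQL pipeline, Scrapy's built-in feed export can also dump the items straight to a file:
~~~
scrapy crawl cententjob -o jobs.json
~~~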