```python
import re
import urllib.parse
import urllib.request
from urllib.error import URLError

import xlwt
from bs4 import BeautifulSoup

'''
Fetch the historical draw data of the Double Color Ball (双色球) lottery.
'''
class DoubleColorBallItem(object):
    date = None          # draw date
    order = None         # issue number
    red1 = None          # red ball 1
    red2 = None          # red ball 2
    red3 = None          # red ball 3
    red4 = None          # red ball 4
    red5 = None          # red ball 5
    red6 = None          # red ball 6
    blue = None          # blue ball
    money = None         # sales amount (prize pool)
    firstPrize = None    # number of first-prize winners
    secondPrize = None   # number of second-prize winners
class ExportToExcel(object):
    def __init__(self, fields, items, fileName, sheetName):
        self.fields = fields
        self.items = items
        self.fileName = fileName
        self.sheetName = sheetName
        self.run(fields, items)

    def run(self, fields, items):
        book = xlwt.Workbook(encoding='utf8')
        sheet = book.add_sheet(self.sheetName, cell_overwrite_ok=True)
        row = 0
        # Header row first, if one was supplied.
        if fields is not None:
            for col, field in enumerate(fields):
                sheet.write(row, col, field)
            row += 1
        # Then one row per record.
        if items is not None:
            for item in items:
                for col, value in enumerate(item):
                    sheet.write(row, col, value)
                row += 1
        book.save(self.fileName)
class GetDoubleColorBallNumber(object):
    def __init__(self):
        self.urls = self.getUrls()
        self.items = self.spider(self.urls)
        self.pipelines(self.items)
        # Alternative: self.exportToExcel(self.items, '双色球.xls')
        fields = ['开奖日期', '期号', '红1', '红2', '红3', '红4',
                  '红5', '红6', '蓝', '销售金额', '一等奖', '二等奖']
        rows = []
        for item in self.items:
            rows.append([item.date, item.order,
                         item.red1, item.red2, item.red3,
                         item.red4, item.red5, item.red6,
                         item.blue, item.money,
                         item.firstPrize, item.secondPrize])
        ExportToExcel(fields=fields, items=rows,
                      fileName='双色球.xls', sheetName='ball')
    # Collect the URLs of every listing page that needs to be scraped.
    def getUrls(self):
        urls = []
        url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html'
        html = self.getResponseContent(url)
        soup = BeautifulSoup(html, 'lxml')
        # The pager is the last <p> on the page; its first <strong> holds
        # the total page count. Selector for reference:
        # body > table > tbody > tr:nth-child(23) > td > p.pg > strong:nth-child(1)
        tag = soup.find_all(re.compile('p'))[-1]
        pages = tag.strong.get_text()
        for i in range(1, int(pages) + 1):
            urls.append('http://kaijiang.zhcw.com/zhcw/html/ssq/list_%d.html' % i)
        return urls
    # Scrape the draw records from every listing page.
    def spider(self, urls):
        items = []
        for url in urls:
            html = self.getResponseContent(url)
            if html is None:
                continue
            soup = BeautifulSoup(html, 'lxml')
            for tag in soup.find_all('tr'):
                # Only data rows contain <em> tags (the ball numbers).
                if not tag.find('em'):
                    continue
                item = DoubleColorBallItem()
                tagTd = tag.find_all('td')
                item.date = tagTd[0].get_text()
                item.order = tagTd[1].get_text()
                tagEm = tagTd[2].find_all('em')
                item.red1 = tagEm[0].get_text()
                item.red2 = tagEm[1].get_text()
                item.red3 = tagEm[2].get_text()
                item.red4 = tagEm[3].get_text()
                item.red5 = tagEm[4].get_text()
                item.red6 = tagEm[5].get_text()
                item.blue = tagEm[6].get_text()
                # Strip thousands separators from the money and winner counts.
                item.money = tagTd[3].strong.get_text().replace(',', '')
                item.firstPrize = tagTd[4].strong.get_text().replace(',', '')
                item.secondPrize = tagTd[5].strong.get_text().replace(',', '')
                items.append(item)
        return items
    # Save the records to an Excel file (standalone alternative to the
    # ExportToExcel helper above).
    def exportToExcel(self, items, fileName):
        book = xlwt.Workbook(encoding='utf8')
        sheet = book.add_sheet('ball', cell_overwrite_ok=True)
        headers = ['开奖日期', '期号', '红1', '红2', '红3', '红4',
                   '红5', '红6', '蓝', '销售金额', '一等奖', '二等奖']
        for col, header in enumerate(headers):
            sheet.write(0, col, header)
        for row, item in enumerate(items, start=1):
            values = [item.date, item.order,
                      item.red1, item.red2, item.red3,
                      item.red4, item.red5, item.red6,
                      item.blue, item.money,
                      item.firstPrize, item.secondPrize]
            for col, value in enumerate(values):
                sheet.write(row, col, value)
        book.save(fileName)
    # Dump the scraped records to a plain-text file.
    def pipelines(self, items):
        fileName = '双色球.txt'
        with open(fileName, 'w') as fp:
            for item in items:
                fp.write('%s %s \t %s %s %s %s %s %s %s \t %s \t %s %s \n' % (
                    item.date, item.order,
                    item.red1, item.red2, item.red3,
                    item.red4, item.red5, item.red6, item.blue,
                    item.money, item.firstPrize, item.secondPrize))

    def getResponseContent(self, url):
        return self.download(url)
    # Download a page, retrying on 5xx server errors.
    def download(self, url, user_agent='wswp', proxy=None, num_retries=3):
        print("Downloading: %s" % url)
        headers = {'User-agent': user_agent}
        request = urllib.request.Request(url, headers=headers)
        opener = urllib.request.build_opener()
        if proxy:
            # Route the request through the proxy for this URL's scheme.
            proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        html = None
        try:
            html = opener.open(request).read()
            html = html.decode('utf-8')  # bytes -> str (Python 3)
        except URLError as e:
            print("Download error: %s" % e.reason)
            html = None
            # Retry only on server-side (5xx) errors.
            if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                print("Retrying ...")
                return self.download(url, user_agent, proxy, num_retries - 1)
        return html


if __name__ == '__main__':
    GetDoubleColorBallNumber()
```
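
To sanity-check the workbook, the file can be read back with xlrd, the reader companion to xlwt (a minimal sketch, assuming the script above has already produced 双色球.xls in the working directory):

```python
import xlrd

book = xlrd.open_workbook('双色球.xls')    # workbook written by the scraper
sheet = book.sheet_by_name('ball')

print('rows:', sheet.nrows)                # header row + one row per draw
print('header:', sheet.row_values(0))
print('first draw:', sheet.row_values(1))
```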
Sample of the scraped results (双色球.txt):
```
2019-04-21 2019045 01 06 17 19 27 31 14 380577960 7 83
2019-04-18 2019044 06 14 16 17 23 29 07 350579948 12 188
2019-04-16 2019043 01 06 12 13 24 32 13 345296562 2 99
2019-04-14 2019042 15 17 19 22 25 26 04 376134240 4 132
2019-04-11 2019041 02 09 13 23 24 26 16 347145834 2 103
2019-04-09 2019040 05 06 09 18 23 31 11 335974062 8 159
2019-04-07 2019039 06 07 11 14 27 32 08 363128750 11 136
2019-04-04 2019038 09 12 21 27 29 30 05 335328036 10 101
2019-04-02 2019037 01 07 12 14 18 25 10 344923376 12 213
```
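
Each line has a fixed field order (date, issue number, six red balls, blue ball, sales amount, first- and second-prize winner counts), so the file is easy to post-process. For example, a quick tally of how often each red ball was drawn (an illustrative sketch; the field positions are assumed from the format string in pipelines):

```python
from collections import Counter

red_counts = Counter()
with open('双色球.txt') as fp:
    for line in fp:
        parts = line.split()
        if len(parts) < 12:
            continue                     # skip any malformed line
        red_counts.update(parts[2:8])    # fields 3-8 are the six red balls

# Ten most frequently drawn red balls across all draws
print(red_counts.most_common(10))
```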