```
import urllib.parse
import urllib.request
from urllib.error import URLError
from bs4 import BeautifulSoup
'''
Scrape historical draw results for the Double Color Ball (双色球) lottery.
'''
class DoubleColorBallItem(object):
    date = None         # draw date
    order = None        # issue number
    red1 = None         # red ball 1
    red2 = None         # red ball 2
    red3 = None         # red ball 3
    red4 = None         # red ball 4
    red5 = None         # red ball 5
    red6 = None         # red ball 6
    blue = None         # blue ball
    money = None        # prize pool
    firstPrize = None   # number of first-prize winners
    secondPrize = None  # number of second-prize winners
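# Note: the fields above are class-level defaults; spider() below assigns
# fresh instance attributes on every DoubleColorBallItem, so draws do not
# share state. A dataclass would be the more idiomatic shape in current
# Python, but the plain class works the same way here.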
class GetDoubleColorBallNumber(object):
    def __init__(self):
        urls = self.getUrls()
        items = self.spider(urls)
        self.pipelines(items)

    # Write the scraped results to a CSV file.
    def pipelines(self, items):
        fileName = 'ssq.csv'
        with open(fileName, 'w', encoding='utf-8') as fp:
            for item in items:
                fp.write('%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n' % (
                    item.date, item.order, item.red1, item.red2, item.red3,
                    item.red4, item.red5, item.red6, item.blue,
                    item.money, item.firstPrize, item.secondPrize))
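    # Illustrative alternative, not part of the original script: the csv
    # module escapes commas and quotes automatically, which is safer if any
    # field ever contains a comma. The method name is hypothetical.
    def pipelinesCsv(self, items):
        import csv
        with open('ssq.csv', 'w', newline='', encoding='utf-8') as fp:
            writer = csv.writer(fp)
            for item in items:
                writer.writerow([item.date, item.order, item.red1, item.red2,
                                 item.red3, item.red4, item.red5, item.red6,
                                 item.blue, item.money,
                                 item.firstPrize, item.secondPrize])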
    # Scrape the draw records from each page.
    def spider(self, urls):
        items = []
        for url in urls:
            html = self.getResponseContent(url)
            if html is not None:
                soup = BeautifulSoup(html, 'lxml')
                tags = soup.find_all('tr')
                for tag in tags:
                    # Data rows carry the ball numbers in <em> tags:
                    # six red balls followed by one blue ball.
                    if tag.find('em'):
                        item = DoubleColorBallItem()
                        tagTd = tag.find_all('td')
                        item.date = tagTd[0].get_text()
                        item.order = tagTd[1].get_text()
                        tagEm = tagTd[2].find_all('em')
                        item.red1 = tagEm[0].get_text()
                        item.red2 = tagEm[1].get_text()
                        item.red3 = tagEm[2].get_text()
                        item.red4 = tagEm[3].get_text()
                        item.red5 = tagEm[4].get_text()
                        item.red6 = tagEm[5].get_text()
                        item.blue = tagEm[6].get_text()
                        item.money = tagTd[3].strong.get_text().replace(',', '')
                        item.firstPrize = tagTd[4].strong.get_text().replace(',', '')
                        item.secondPrize = tagTd[5].strong.get_text().replace(',', '')
                        items.append(item)
        return items
    # Build the list of paginated URLs to scrape.
    def getUrls(self):
        urls = []
        url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list.html'
        html = self.getResponseContent(url)
        soup = BeautifulSoup(html, 'lxml')
        # The page count sits in the <strong> of the last <p> on the page.
        tag = soup.find_all('p')[-1]
        pages = tag.strong.get_text()
        print('Total pages: ' + pages)
        for i in range(1, int(pages) + 1):
            url = 'http://kaijiang.zhcw.com/zhcw/html/ssq/list_' + str(i) + '.html'
            urls.append(url)
        return urls
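    # The generated page URLs follow the pattern used above, e.g.:
    #   http://kaijiang.zhcw.com/zhcw/html/ssq/list_1.html
    #   http://kaijiang.zhcw.com/zhcw/html/ssq/list_2.html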
    def getResponseContent(self, url):
        user_agent = 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1) QQBrowser/6.0'
        html = self.download(url, num_retries=4, user_agent=user_agent)
        return html
    # Download a page, retrying on 5xx server errors.
    def download(self, url, user_agent='wswp', proxy=None, num_retries=3):
        print("Downloading: %s" % url)
        headers = {'User-agent': user_agent}
        request = urllib.request.Request(url, headers=headers)
        opener = urllib.request.build_opener()
        if proxy:
            proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib.request.ProxyHandler(proxy_params))
        html = None
        try:
            html = opener.open(request).read()
            html = html.decode('utf-8')  # Python 3: bytes -> str
        except URLError as e:
            print("Download error (reason: %s)" % e.reason)
            html = None
            if num_retries > 0 and hasattr(e, 'code') and 500 <= e.code < 600:
                # Retry only on 5xx responses; 4xx errors will not recover.
                print("Retrying ...")
                return self.download(url, user_agent, proxy, num_retries - 1)
        return html
if __name__ == '__main__':
    GetDoubleColorBallNumber()
```
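Running the script writes one draw per row to `ssq.csv`. A minimal sketch for reading the file back, assuming the column order produced by `pipelines` above:

```
import csv

# Columns, in the order pipelines() writes them.
COLUMNS = ['date', 'order', 'red1', 'red2', 'red3', 'red4',
           'red5', 'red6', 'blue', 'money', 'firstPrize', 'secondPrize']

with open('ssq.csv', encoding='utf-8') as fp:
    for row in csv.reader(fp):
        record = dict(zip(COLUMNS, row))
        print(record['date'], record['order'], record['blue'])
```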