A good friend's graduation thesis needed the news coverage of fund managers crawled and the relationship between news counts and fund turnover analysed, so of course I volunteered to help.
Task description: crawl the Baidu News (百度新闻) search results for 300 fund managers and tally each manager's news count by month and by quarter.
Tools: BeautifulSoup, urllib.request, and plain Python file I/O (plus xlwt/xlrd for the Excel files).
import url_manager, html_downloader, html_parser, html_output
import xlwt
import xlrd
import urllib.parse


class SpiderMain(object):
    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.output = html_output.HtmlOutputer()
    def craw(self, sheet1, sheet2, root_url, num, name):
        count = 1
        # one bucket per month and one per quarter for the years 2001-2016
        resultlistM = [0] * ((2016 - 2000) * 12)
        resultlistS = [0] * ((2016 - 2000) * 4)
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('crawling URL => %d ... : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, resultlistM, resultlistS = self.parser.parse(new_url, html_cont, resultlistM, resultlistS)
                self.urls.add_new_urls(new_urls)
                # if count == 100:  # (debug) stop early after 100 pages
                #     break
                count = count + 1
            except Exception as e:
                print(e)
                print('crawl failed')
        self.output.collect_data(sheet1, resultlistM, name, num)
        self.output.collect_data(sheet2, resultlistS, name, num)
if __name__ == "__main__":
    wb = xlwt.Workbook()
    wsmonth = wb.add_sheet('month')
    wsseason = wb.add_sheet('season')
    # header row of the monthly sheet: 201612, 201611, ..., 200101
    A2016 = list(range(201612, 201600, -1))
    A = A2016
    for year in range(1, 16):
        A = A + [a - 100 * year for a in A2016]
    for gap in range(len(A)):
        wsmonth.write(0, gap + 1, A[gap])
    # header row of the quarterly sheet: 2016年第4季度 ... 2001年第1季度
    for Ygap in range(16):
        for Sgap in range(4):
            if 16 - Ygap < 10:
                B = "0" + str(16 - Ygap)
            else:
                B = str(16 - Ygap)
            wsseason.write(0, Ygap * 4 + Sgap + 1, "20" + B + "年第" + str(4 - Sgap) + "季度")
    keywords = xlrd.open_workbook('keywords.xlsx')  # 295 fund manager names, one per row
    sh = keywords.sheet_by_index(0)
    for num in range(295):
        name = sh.cell(num, 0).value
        root_url = "http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=" + urllib.parse.quote(name)
        # root_url = "https://www.baidu.com/s?wd=" + urllib.parse.quote(name)
        obj_spider = SpiderMain()
        number = num + 1
        obj_spider.craw(wsmonth, wsseason, root_url, number, name)
        wb.save('new_result.xls')  # save after every manager so a crash keeps earlier rows
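The url_manager module is not reproduced here. A minimal sketch that matches the calls SpiderMain makes on it (add_new_url, add_new_urls, has_new_url, get_new_url) could look like the following; treat it as an assumed implementation of the usual scheduler pattern rather than the original file:

# url_manager.py -- assumed implementation, reconstructed from how SpiderMain uses it
class UrlManager(object):
    def __init__(self):
        self.new_urls = set()   # URLs still to be crawled
        self.old_urls = set()   # URLs already crawled

    def add_new_url(self, url):
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        if urls is None:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url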
import urllib.request
import ssl


class HtmlDownloader(object):
    # skip HTTPS certificate verification so urlopen does not fail on Baidu's redirects
    ssl._create_default_https_context = ssl._create_unverified_context

    def download(self, url):
        if url is None:
            return None
        # pretend to be a regular browser so Baidu serves the normal HTML result page
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        if response.getcode() != 200:
            print(response.getcode())
            return None
        return response.read()
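A quick way to smoke-test the downloader on a single result page (a standalone sketch; the keyword is the one that appears URL-encoded in the pagination-link comment inside the parser below):

# smoke test: fetch one Baidu News result page and report its size
import urllib.parse

keyword = "博时杨锐"  # sample keyword decoded from the parser's pagination-link comment
url = "http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=" + urllib.parse.quote(keyword)
downloader = HtmlDownloader()
html = downloader.download(url)
print(len(html) if html else "download failed")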
from bs4 import BeautifulSoup
import re
import urllib.parse


class HtmlParser(object):
    def parse(self, page_url, html_cont, resultlistM, resultlistS):
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_resultlistM, new_resultlistS = self._get_new_data(resultlistM, resultlistS, soup)
        return new_urls, new_resultlistM, new_resultlistS

    def _get_new_urls(self, page_url, soup):
        # pagination links look like:
        # <a href="/ns?word=%E5%8D%9A%E6%97%B6%E6%9D%A8%E9%94%90&pn=60&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0">
        #   <span class="fk fkd"><i class="c-icon c-icon-bear-pn"></i></span><span class="pc">4</span></a>
        new_urls = set()
        links = soup.find_all('a', href=re.compile(r"/ns\?word=.*pn=[^0].*"))
        if not links:
            print("no pagination links found")
        for link in links:
            new_url = link['href']
            # skip the "previous page" style links
            if new_url.endswith('-1') or new_url.endswith('1'):
                continue
            new_full_url = urllib.parse.urljoin(page_url, new_url)
            new_urls.add(new_full_url)
        return new_urls

    def data_process(self, data):
        # keep only the digits and take the first six, i.e. YYYYMM:
        # "凤凰网  2014年07月29日 15:00" -> "201407291500" -> 201407
        # for a "N条相同新闻" link the digits are simply the count N
        data = re.sub(r"\D", "", data)
        real_data = int(data[0:6])
        return real_data

    def _get_new_data(self, resultlistM, resultlistS, soup):
        # each search result is a <div class="result"> containing a
        # <p class="c-author">source&nbsp;&nbsp;2014年07月29日 15:00</p> line
        nodes = soup.find_all('div', class_="result")
        for node in nodes:
            time = node.find('p', class_="c-author")
            realdata = self.data_process(time.get_text())
            if realdata > 201612 or realdata < 200101:
                print("time out of range")
                continue
            year_offset = (realdata - 200100) // 100   # 0 for 2001 ... 15 for 2016
            month = (realdata - 200100) % 100          # 1..12
            # columns run from 2016-12 (index 0) back to 2001-01 (index 191)
            index = 16 * 12 - (year_offset * 12 + month)
            index2 = 16 * 4 - (year_offset * 4 + (month - 1) // 3 + 1)
            num = node.find('a', class_="c-more_link")
            if num is None:
                resultlistM[index] = resultlistM[index] + 1
                resultlistS[index2] = resultlistS[index2] + 1
            else:
                # "N条相同新闻" means N similar articles are folded into this result
                realnum = self.data_process(num.get_text())
                resultlistM[index] = resultlistM[index] + realnum
                resultlistS[index2] = resultlistS[index2] + realnum
        return resultlistM, resultlistS
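To sanity-check the month/quarter bucketing against the header rows written in spider_main, here is a small worked example using the same arithmetic (a standalone sketch, not part of the crawler):

# map a YYYYMM integer onto the column layout used above:
# monthly columns run 201612, 201611, ..., 200101 (192 of them),
# quarterly columns run 2016 Q4, 2016 Q3, ..., 2001 Q1 (64 of them)
def month_index(yyyymm):
    year_offset = (yyyymm - 200100) // 100   # 0 for 2001 ... 15 for 2016
    month = (yyyymm - 200100) % 100          # 1..12
    return 16 * 12 - (year_offset * 12 + month)

def quarter_index(yyyymm):
    year_offset = (yyyymm - 200100) // 100
    quarter = ((yyyymm - 200100) % 100 - 1) // 3 + 1
    return 16 * 4 - (year_offset * 4 + quarter)

print(month_index(201612), month_index(200101))   # 0 and 191: first and last monthly columns
print(quarter_index(201407))                      # July 2014 is 2014 Q3 -> 9 (10th quarterly column)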
class HtmlOutputer(object):
    def __init__(self):
        self.datas = []

    def collect_data(self, sheet, resultlist, name, num):
        # row `num` holds one manager: the name in column 0, then the monthly or quarterly counts
        count = 1
        sheet.write(num, 0, name)
        for result in resultlist:
            sheet.write(num, count, result)
            count = count + 1
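After a run, the totals can be spot-checked by reading the result file back with xlrd (a sketch; assumes the new_result.xls produced by the main script above):

# print the total monthly news count for each manager in new_result.xls
import xlrd

book = xlrd.open_workbook('new_result.xls')
month_sheet = book.sheet_by_name('month')
for row in range(1, month_sheet.nrows):
    name = month_sheet.cell(row, 0).value
    counts = [month_sheet.cell(row, col).value for col in range(1, month_sheet.ncols)]
    print(name, sum(c for c in counts if c != ''))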