Crawling Baidu News data with Python and tallying article dates into Excel

Background

A good friend's graduation thesis needed the number of news articles written about individual fund managers, in order to study how news coverage relates to fund turnover, so naturally I couldn't say no.
Task: crawl the Baidu News search results for around three hundred fund managers and tally the number of articles per month and per quarter.
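
Before any crawling code, it helps to pin down the bookkeeping: each article's publication date is reduced to a YYYYMM integer and dropped into one of 192 monthly buckets (2001-01 through 2016-12) and one of 64 quarterly buckets, newest period first so the buckets line up with the spreadsheet columns generated later. A minimal sketch of that mapping, with placeholder function names that are not part of the project code:

def month_bucket(yyyymm):
    year, month = divmod(yyyymm, 100)         # e.g. 201407 -> (2014, 7)
    return (2016 - year) * 12 + (12 - month)  # 201612 -> 0, 200101 -> 191

def quarter_bucket(yyyymm):
    year, month = divmod(yyyymm, 100)
    quarter = (month - 1) // 3                # 0..3 for Q1..Q4
    return (2016 - year) * 4 + (3 - quarter)  # 201612 -> 0, 200101 -> 63

print(month_bucket(201407), quarter_bucket(201407))  # 29 9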

Technologies used

BeautifulSoup, urllib.request, xlwt/xlrd, and basic Python file I/O

Talk is cheap, show me the code

Main program: GCWspider_main.py


import url_manager,html_downloader,html_parser,html_output
import xlwt
import xlrd
import urllib


class SpiderMain(object):
    def __init__(self):
        self.urls=url_manager.UrlManager()
        self.downloader=html_downloader.HtmlDownoader()
        self.parser=html_parser.HtmlParser()
        self.output=html_output.HtmlOutputer()
    def craw(self,sheet1,sheet2,root_url,num,name):
        count=1
        listZeros=[0]
        resultlistM=listZeros*((2016-2000)*12)
        resultlistS = listZeros * ((2016 - 2000) * 4)


        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url=self.urls.get_new_url()
                print('crawling URL => %d ... : %s' % (count, new_url))
                html_cont=self.downloader.download(new_url)
                new_urls, resultlistM,resultlistS=self.parser.parse(new_url,html_cont,resultlistM,resultlistS)
                self.urls.add_new_urls(new_urls)


                #if count==100:
                #    break
                count=count+1

            except Exception as e:
                print(e)
                print('crawling failed')

        #self.output.output_html()
        self.output.collect_data(sheet1, resultlistM,name,num)
        self.output.collect_data(sheet2, resultlistS, name, num)




if __name__=="__main__":

    wb = xlwt.Workbook()
    wsmonth = wb.add_sheet('month')
    wsseason = wb.add_sheet('season')

    A2016=list(range(201612,201600,-1))  # month headers: 201612..201601; the loop below extends this back to 200101
    A=A2016
    for year in range(1,16):
        A=A+[a-100*year for a in A2016]
    for gap in range(len(A)):
        wsmonth.write(0,gap+1,A[gap])


    for Ygap in range(16):    # quarter headers: 2016 Q4 back to 2001 Q1
        for Sgap in range(4):
            if 16 - Ygap < 10:
                B = "0" + str(16 - Ygap)
            else:
                B = str(16 - Ygap)
            wsseason.write(0,Ygap*4+Sgap+1,"20" + B + "年第" + str(4 - Sgap) + "季度")



    keywords=xlrd.open_workbook('keywords.xlsx')  # 295 fund-manager names, one per row
    sh = keywords.sheet_by_index(0)

    for num in range(295):
        name=sh.cell(num,0).value
        root_url = "http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=" + urllib.parse.quote(name)

        #root_url = "https://www.baidu.com/s?wd=" + urllib.parse.quote(name)

        obj_spider = SpiderMain()
        number=num+1
        obj_spider.craw(wsmonth,wsseason, root_url,number,name)

    wb.save('new_result.xls')
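
The root URL above is simply Baidu News's search endpoint with the manager's name URL-encoded into the word parameter (rn=20 appears to control the number of results per page). For a hypothetical keyword, not one taken from the real keyword list, it looks like this:

import urllib.parse

name = "张三"  # example keyword only
print("http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=" + urllib.parse.quote(name))
# http://news.baidu.com/ns?cl=2&rn=20&tn=news&word=%E5%BC%A0%E4%B8%89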

Downloader: html_downloader.py


import urllib.request
import ssl


class HtmlDownoader(object):
    ssl._create_default_https_context = ssl._create_unverified_context  # skip HTTPS certificate verification
    def download(self, url):
        if url is None:
            return None
        user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
        headers = {'User-Agent': user_agent}
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        if response.getcode()!=200:
            print(response.getcode())
            return None
        return response.read()
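
A quick standalone way to sanity-check the downloader, using the Baidu News front page purely as an example URL:

if __name__ == '__main__':
    downloader = HtmlDownoader()
    page = downloader.download('http://news.baidu.com/')
    print(len(page) if page else 'download failed')  # number of bytes fetched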


Parser: html_parser.py

from bs4 import BeautifulSoup
import re
import urllib

class HtmlParser(object):

    def parse(self, page_url, html_cont,resultlistM,resultlistS):
        if page_url is None or html_cont is None:
            return set(), resultlistM, resultlistS
        soup=BeautifulSoup(html_cont,'html.parser',from_encoding='utf-8')
        # print(soup)
        new_urls=self._get_new_urls(page_url,soup)
        new_resultlistM,new_resultlistS=self._get_new_data(resultlistM,resultlistS,soup)
        return new_urls,new_resultlistM,new_resultlistS

    def _get_new_urls(self, page_url, soup):
        new_urls=set()
        # Pagination links look like:
        # <a href="/ns?word=%E5%8D%9A%E6%97%B6%E6%9D%A8%E9%94%90&pn=60&cl=2&ct=1&tn=news&rn=20&ie=utf-8&bt=0&et=0">
        #   <span class="fk fkd"><i class="c-icon c-icon-bear-pn"></i></span><span class="pc">4</span></a>
        # print(soup)
        links=soup.find_all('a', href=re.compile(r"/ns\?word=.*pn=[^0].*"))
        if not links:
            print("no pagination links found")
        for link in links:
            new_url=link['href']
            if new_url.endswith('-1') or new_url.endswith('1'):
                continue
            new_full_url=urllib.parse.urljoin(page_url,new_url)
            new_urls.add(new_full_url)
        return new_urls

    def data_process(self,data):
        # strip non-digits, then keep the leading YYYYMM, e.g. "2014年07月29日 15:00" -> 201407
        data=re.sub(r"\D","",data)
        real_data=int(data[0:6])
        return real_data

    def _get_new_data(self, resultlistM,resultlistS, soup):
        res_data=[]
        #res_data['url']=page_url


        #title_node=soup.find('dd',class_="lemmaWgt-lemmaTitle-title").find('h1')
        nodes=soup.find_all('div',class_="result")
        # Each result carries its source and date, e.g.
        # <p class="c-author">凤凰网&nbsp;&nbsp;2014年07月29日 15:00</p>
        for node in nodes:
            time=node.find('p',class_="c-author")
            if time is None:
                continue
            realdata=self.data_process(time.get_text())
            if realdata>201612 or realdata<200101:
                print("time out of range")
                continue
            # 0-based bucket indices, newest period first (bucket 0 = 2016-12 / 2016 Q4)
            index=int((realdata-200100)/100)*12+((realdata-200100)%100)       # 1..192, oldest month = 1
            index=16*12-index                                                  # 0..191, newest first
            index2=int((realdata-200100)/100)*4+int(((realdata-200100)%100-1)/3)  # 0..63, oldest quarter = 0
            index2=16*4-index2-1                                               # 0..63, newest first
            num=node.find('a',class_="c-more_link")
            if num is None:
                resultlistM[index]=resultlistM[index]+1
                resultlistS[index2]=resultlistS[index2]+1
            else:
                # when a c-more_link is present, add the number it reports instead of 1
                realnum=self.data_process(num.get_text())
                resultlistM[index]=resultlistM[index]+realnum
                resultlistS[index2]=resultlistS[index2]+realnum
        #summary_node=soup.find('div',class_="lemma-summary")
        #res_data['summary']=summary_node.get_text()
        return resultlistM,resultlistS
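
To see that this indexing lines up with the spreadsheet headers written in GCWspider_main.py, take July 2014 as a worked example; this is a standalone check, not part of the module:

realdata = 201407
index = 16*12 - (int((realdata-200100)/100)*12 + (realdata-200100)%100)
index2 = 16*4 - (int((realdata-200100)/100)*4 + int(((realdata-200100)%100-1)/3)) - 1
print(index, index2)  # 29 9 -> collect_data writes these buckets to columns 30 and 10,
                      # which is where the header loops put 201407 and 2014年第3季度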

Outputer: html_output.py


class HtmlOutputer(object):
    def __init__(self):
        self.datas=[]

    def collect_data(self, sheet,resultlist,name,num):
        # one row per fund manager: name in column 0, one count per period column
        count=1
        sheet.write(num,0,name)
        for result in resultlist:
            sheet.write(num,count,result)
            count = count + 1
        #self.datas.append(data)
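
After a full run, new_result.xls has one row per fund manager on both sheets: monthly counts on 'month' and quarterly counts on 'season', newest period in column 1. To spot-check a row, xlrd (already a dependency of the project) can read the file back; a minimal sketch, assuming the workbook is in the current directory:

import xlrd

wb = xlrd.open_workbook('new_result.xls')
month_sheet = wb.sheet_by_name('month')
print(month_sheet.row_values(1)[:5])  # first manager's name plus the four newest monthly counts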
