[Python study notes] Automatically scraping Yahoo News articles

Search Yahoo News (http://news.yahoo.com/) for a keyword, filter out results hosted on Yahoo News itself, and pull the article body out of the raw HTML by scoring the text density of each line group and keeping the densest contiguous run as the body. The extracted text is then cleaned (HTML tags, useless fields and other junk removed) and saved to a .txt file. As a final step, articles that are invalid, too short, or otherwise below the quality bar are discarded (a sketch of this step follows the script below).

The remaining problem is that a single HTTP error while fetching an article aborts the whole program, which badly hurts efficiency.
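A minimal sketch of a workaround, assuming it is acceptable to simply skip an article whose fetch fails: wrap the call to extract_news_content (defined in the script below) so that urllib2 errors only drop that one article instead of killing the run. The wrapper name extract_news_content_safe is hypothetical, not part of the original script.

import urllib2

def extract_news_content_safe(web_url, file_name):
    # hypothetical wrapper: swallow fetch errors so one bad link
    # does not abort the whole crawl
    try:
        extract_news_content(web_url, file_name)
        return True
    except (urllib2.HTTPError, urllib2.URLError), e:
        print 'skipped %s: %s' % (web_url, e)
        return False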

#coding:utf-8
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup

#Extract the main text of a web page and save it to a txt file
def remove_js_css (content):
    """ remove the javascript, the stylesheet and the comment content
    (<script>...</script>, <style>...</style>, <!-- ... --> and similar blocks) """
    r = re.compile(r'''<script.*?</script>''',re.I|re.M|re.S)
    s = r.sub ('',content)
    r = re.compile(r'''<style.*?</style>''',re.I|re.M|re.S)
    s = r.sub ('', s)
    r = re.compile(r'''<!--.*?-->''', re.I|re.M|re.S)
    s = r.sub('',s)
    r = re.compile(r'''<!\[CDATA\[.*?\]\]>''', re.I|re.M|re.S)
    s = r.sub('',s)
    r = re.compile(r'''<ins.*?</ins>''', re.I|re.M|re.S)
    s = r.sub('',s)
    return s

def remove_empty_line (content):
    """remove multi space """
    r = re.compile(r'''^\s+$''', re.M|re.S)
    s = r.sub ('', content)
    r = re.compile(r'''\n+''',re.M|re.S)
    s = r.sub('\n',s)
    return s

def remove_any_tag (s):
    s = re.sub(r'''<[^>]+>''','',s)
    return s.strip()

def remove_any_tag_but_a (s):
    # return (length of text inside <a> tags, length of all text with tags stripped)
    text = re.findall (r'''<a[^>]*>(.*?)</a>''',s,re.I|re.S)
    text_b = remove_any_tag (s)
    return len(''.join(text)),len(text_b)

def remove_image (s,n=50):
    # replace every <img> tag with a short placeholder so an image still counts as some text
    image = 'a' * n
    r = re.compile (r'''<img.*?>''',re.I|re.M|re.S)
    s = r.sub(image,s)
    return s

def remove_video (s,n=1000):
    # replace every embedded video tag with a long placeholder so the block is not discarded
    video = 'a' * n
    r = re.compile (r'''<embed.*?>''',re.I|re.M|re.S)
    s = r.sub(video,s)
    return s

def sum_max (values):
    # maximum-sum contiguous run of group scores (Kadane's algorithm)
    cur_max = 0
    glo_max = -999999
    left,right = 0,0
    for index,value in enumerate (values):
        cur_max += value
        if (cur_max > glo_max) :
            glo_max = cur_max
            right = index
        if (cur_max < 0):
            cur_max = 0

    # walk back from the right edge until the best sum is used up, which locates the left edge
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.00001:
            left = i
            break
    return left,right+1
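# A quick worked example (not from the original post): for group scores
# [-3, 4, 2, -1, 5, -6] the best contiguous run is values[1:5],
# summing to 4 + 2 - 1 + 5 = 10, so sum_max returns (1, 5).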

def method_1 (content, k=1):
    # score every group of k lines: plain-text length minus link-text length,
    # minus a small constant penalty, then keep the maximum-sum run of groups
    if not content:
        return None,None,None,None
    tmp = content.split('\n')
    group_value = []
    for i in range(0,len(tmp),k):
        group = '\n'.join(tmp[i:i+k])
        group = remove_image (group)
        group = remove_video (group)
        text_a,text_b= remove_any_tag_but_a (group)
        temp = (text_b - text_a) - 8
        group_value.append (temp)
    left,right = sum_max (group_value)
    return left,right, len('\n'.join(tmp[:left])), len ('\n'.join(tmp[:right]))

def extract (content):
    content = remove_empty_line(remove_js_css(content))
    left,right,x,y = method_1 (content)
    return '\n'.join(content.split('\n')[left:right])
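# Usage sketch (hypothetical page, not in the original post): pass raw HTML to
# extract() and it returns the lines judged to be the article body; tags are
# only stripped later, in extract_news_content().
#   page = urllib2.urlopen('http://example.com/some-article.html').read()
#   body = extract(page)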

#Given a url, append the body text of that news page to a txt file
def extract_news_content(web_url,file_name):
    request = urllib2.Request(web_url)

    # add a User-Agent header so the request looks like a normal browser visit
    request.add_header('User-Agent','Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    html= opener.open(request).read()
    infoencode = chardet.detect(html)['encoding']# detect the page encoding with the third-party chardet module
    if html!=None and infoencode!=None:# only proceed when both the page and the detected encoding are usable
        html = html.decode(infoencode,'ignore')
        soup=BeautifulSoup(html)
        content=soup.renderContents()
        content_text=extract(content)# extract the article body of the news page as one block of text
        content_text= re.sub("&nbsp;"," ",content_text)
        content_text= re.sub("&gt;","",content_text)
        content_text= re.sub("&quot;",'"',content_text)
        content_text= re.sub("<[^>]+>","",content_text)
        content_text=re.sub("\n"," ",content_text)
        file = open(file_name,'a')# append the body after the metadata written by search()
        file.write(content_text)
        file.close()

#Scrape Yahoo News search result pages; url: key_word=china%20usa (the key_word argument is currently unused; the search url below is hard-coded)
def search(key_word):
        #search_url='https://news.search.yahoo.com/search?fr=uh3_news_vert_gs&type=2button&p=key_word' 
        #req=urllib2.urlopen(search_url.replace('key_word',key_word))
        req=urllib2.urlopen('https://news.search.yahoo.com/search;_ylt=AwrSyCNrxf9U3hwA24nQtDMD?p=china+usa&pvid=m.2tmTk4LjEBh3aoU_0HThABNjAuMlT_hcj_3s7o&fr=uh3_news_vert_gs&fr2=sb-top-news.search.yahoo.com&type=2button&xargs=0&pstart=1&b=91')

        #loop over the search result pages and parse each one
        count_news=81
        for count in range(3):# three result pages here, starting from result 91 (b=91 in the url above)
                html=req.read()
                soup=BeautifulSoup(html)

                
                content  = soup.findAll("div", {"class": "res"}) #resultset object
                num = len(content)
                
                for i in range(num):
                        # first parse out each result's title, source, timestamp and url
                        p_str= content[i].find('a',{"class": "yschttl spt"}) # NoneType object if nothing matches
                        contenttitle=p_str.renderContents()
                        contenttitle=contenttitle.decode('utf-8', 'ignore')# decode to unicode before the regex cleanup below
                        contenttitle= re.sub("<[^>]+>","",contenttitle)
                        contentlink=str(p_str.get("href"))
                        contentauthor= content[i].find('span',{"class": "url"}).renderContents()
                        contentauthor=contentauthor.decode('utf-8', 'ignore')
                        contenttime=content[i].find('span',{"class": "timestamp"}).renderContents()
                        contenttime=contenttime.decode('utf-8', 'ignore')
                        # the i-th news item, e.g. filename="D:\\Python27\\tiaozhanbei\\newscn\\%d.txt"%(i)
                        # file = open(filename,'w'), one txt file per news article
                        file_name=r"D:\Python27\tiaozhanbei\newsen\%d.txt"%(count_news)
                        file = open(file_name,'w')# `file` is a Python 2 built-in type, not a reserved word, so shadowing it works, though a different name would be cleaner
                        file.write(contenttitle.encode('utf-8'))
                        file.write(u'\n')
                        file.write(contentauthor.encode('utf-8'))
                        file.write(u'\n')
                        file.write(contenttime.encode('utf-8'))
                        file.write(u'\n'+contentlink+u'\n')
                        file.close()
                        # extract the article body, skipping articles hosted on yahoo.com and a few urls known to break the extractor
                        bad_links = ('http://readingeagle.com/ap/article/list-of-winners-of-the-world-press-photo-awards',
                                     'http://www.thepress-sentinel.com/pages/full_story_free/push?article-Bateman+returns++from+study+in+China%20&id=26507031&instance=lead_story',
                                     'http://onlyfans.cstv.com/schools/msu/sports/w-volley/spec-rel/021315aab.html')
                        if contentlink.find("yahoo.com")==-1 and contentlink not in bad_links:
                            extract_news_content(contentlink,file_name)#还写入文件
                            count_news+=1
                #find and follow the link to the next results page
                next_page=soup('a',{'href':True,'id':'pg-next'})[0]['href'] # search for the next page
                req=urllib2.urlopen(next_page)
                print next_page

if __name__=='__main__':
    key_word=raw_input('input key word:')
    search(key_word)
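
The last step described at the top, dropping invalid or too-short articles, is not implemented in the script above. Below is a minimal post-processing sketch; filter_short_news is a hypothetical name, the folder is the D:\Python27\tiaozhanbei\newsen directory used by search(), and the 300-character threshold is an arbitrary assumption.

import os

def filter_short_news(folder=r"D:\Python27\tiaozhanbei\newsen", min_chars=300):
    # delete saved articles whose body is empty or shorter than min_chars characters
    # (min_chars=300 is an assumed threshold, adjust as needed)
    for name in os.listdir(folder):
        if not name.endswith('.txt'):
            continue
        path = os.path.join(folder, name)
        f = open(path)
        text = f.read()
        f.close()
        # search() writes four metadata lines (title, source, time, url) before the body
        body = '\n'.join(text.split('\n')[4:])
        if len(body.strip()) < min_chars:
            os.remove(path)
            print 'removed', path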

