Search Yahoo News (http://news.yahoo.com/), filter out results whose source is Yahoo News itself, and extract the article body from the HTML source by scoring the text density of each block and taking the longest text run as the body. The text is then cleaned: HTML tags and other junk fields are stripped and the result is saved to a txt file; articles that are empty, too short, or otherwise below the quality bar are dropped afterwards.
The remaining problem is that a single HTTP error aborts the whole program, which badly hurts efficiency.
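One way to soften that failure mode is to catch urllib2's exceptions around each page fetch, so a dead link only skips that one article instead of killing the crawl. A minimal sketch, assuming the caller can tolerate a None return; fetch_or_skip and its timeout are illustrative, not part of the script below:

import urllib2

def fetch_or_skip(url, timeout=30):
    """Return the raw page body, or None when the request fails."""
    try:
        request = urllib2.Request(url)
        request.add_header('User-Agent', 'Mozilla/5.0')
        return urllib2.build_opener().open(request, timeout=timeout).read()
    except (urllib2.HTTPError, urllib2.URLError) as e:
        print 'skipping %s: %s' % (url, e)
        return None

extract_news_content could then call a helper like this first and simply return when it gets None back, letting the search loop move on to the next result.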
#coding:utf-8
import re
import urllib2
import chardet
from BeautifulSoup import BeautifulSoup
# Extract the main text of a news page and write it into a txt file
def remove_js_css(content):
    """Remove the javascript, stylesheet and comment content (<script>...</script>, <style>...</style>, <!-- ... -->)."""
    r = re.compile(r'''<script.*?</script>''', re.I | re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''<style.*?</style>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<!--.*?-->''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<meta.*?>''', re.I | re.M | re.S)
    s = r.sub('', s)
    r = re.compile(r'''<ins.*?</ins>''', re.I | re.M | re.S)
    s = r.sub('', s)
    return s
def remove_empty_line(content):
    """Remove whitespace-only lines and collapse runs of newlines."""
    r = re.compile(r'''^\s+$''', re.M | re.S)
    s = r.sub('', content)
    r = re.compile(r'''\n+''', re.M | re.S)
    s = r.sub('\n', s)
    return s
def remove_any_tag(s):
    s = re.sub(r'''<[^>]+>''', '', s)
    return s.strip()
def remove_any_tag_but_a(s):
    """Return (length of text inside <a> tags, length of all text with tags removed)."""
    text = re.findall(r'''<a[^>]*>(.*?)</a>''', s, re.I | re.S)
    text_b = remove_any_tag(s)
    return len(''.join(text)), len(text_b)
def remove_image(s, n=50):
    """Replace every <img> tag with a fixed-length placeholder so images still count as some content."""
    image = 'a' * n
    r = re.compile(r'''<img.*?>''', re.I | re.M | re.S)
    s = r.sub(image, s)
    return s
def remove_video(s, n=1000):
    """Replace every <embed> tag with a long placeholder so embedded video counts as content."""
    video = 'a' * n
    r = re.compile(r'''<embed.*?>''', re.I | re.M | re.S)
    s = r.sub(video, s)
    return s
def sum_max(values):
    """Maximum-sum contiguous run (Kadane-style scan): returns (left, right+1) of the densest block."""
    cur_max = 0
    glo_max = -999999
    left, right = 0, 0
    for index, value in enumerate(values):
        cur_max += value
        if cur_max > glo_max:
            glo_max = cur_max
            right = index
        elif cur_max < 0:
            cur_max = 0
    # Walk backwards from the right edge to recover the left edge of the best block
    for i in range(right, -1, -1):
        glo_max -= values[i]
        if abs(glo_max) < 0.00001:
            left = i
            break
    return left, right + 1
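# Hypothetical sanity check (not part of the original script): with per-group scores
# where only the third group is text-heavy, the maximum-sum window covers just that group,
# e.g. sum_max([-8, -7, 532, -8, -8]) returns (2, 3).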
def method_1(content, k=1):
    if not content:
        return None, None, None, None
    tmp = content.split('\n')
    group_value = []
    for i in range(0, len(tmp), k):
        group = '\n'.join(tmp[i:i+k])
        group = remove_image(group)
        group = remove_video(group)
        text_a, text_b = remove_any_tag_but_a(group)
        # score = plain-text length minus link-text length, with a small constant penalty
        temp = (text_b - text_a) - 8
        group_value.append(temp)
    left, right = sum_max(group_value)
    return left, right, len('\n'.join(tmp[:left])), len('\n'.join(tmp[:right]))
def extract(content):
    content = remove_empty_line(remove_js_css(content))
    left, right, x, y = method_1(content)
    return '\n'.join(content.split('\n')[left:right])
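# Hypothetical smoke test (not in the original script): on a synthetic page the
# markup-heavy navigation lines score low and the long paragraph scores high, so
# extract() should return the <p> line; tags are stripped later in extract_news_content.
def _demo_extract():
    sample = ('<html><body>\n'
              '<div><a href="/">home</a> <a href="/news">news</a></div>\n'
              '<p>' + 'Real article text, long enough to dominate the score. ' * 10 + '</p>\n'
              '<div><a href="/about">about</a></div>\n'
              '</body></html>')
    print extract(sample)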
# Given a url, write the body of that news page into a txt file
def extract_news_content(web_url, file_name):
    request = urllib2.Request(web_url)
    # Add a User-Agent header so the request looks like a normal browser visit
    request.add_header('User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')
    opener = urllib2.build_opener()
    html = opener.open(request).read()
    infoencode = chardet.detect(html)['encoding']  # detect the page encoding with the third-party chardet module
    if html is not None and infoencode is not None:  # only proceed when the page and its encoding were both obtained
        html = html.decode(infoencode, 'ignore')
        soup = BeautifulSoup(html)
        content = soup.renderContents()
        content_text = extract(content)  # extract the article body of the news page as one block of text
        content_text = re.sub("&nbsp;", " ", content_text)
        content_text = re.sub("&gt;", "", content_text)
        content_text = re.sub("&quot;", '""', content_text)
        content_text = re.sub("<[^>]+>", "", content_text)
        content_text = re.sub("\n", " ", content_text)
        file = open(file_name, 'a')  # append the body after the title/source/time header
        file.write(content_text)
        file.close()
# Crawl the first 10 pages of Yahoo News search results, url: key_word=china%20usa
def search(key_word):
    #search_url='https://news.search.yahoo.com/search?fr=uh3_news_vert_gs&type=2button&p=key_word'
    #req=urllib2.urlopen(search_url.replace('key_word',key_word))
    req = urllib2.urlopen('https://news.search.yahoo.com/search;_ylt=AwrSyCNrxf9U3hwA24nQtDMD?p=china+usa&pvid=m.2tmTk4LjEBh3aoU_0HThABNjAuMlT_hcj_3s7o&fr=uh3_news_vert_gs&fr2=sb-top-news.search.yahoo.com&type=2button&xargs=0&pstart=1&b=91')
    # Crawl the result pages one by one and parse each of them
    count_news = 81
    for count in range(3):  # intended: first 10 result pages
        html = req.read()
        soup = BeautifulSoup(html)
        content = soup.findAll("div", {"class": "res"})  # ResultSet object
        num = len(content)
        for i in range(num):
            # First parse out every article's title, source, timestamp and url
            p_str = content[i].find('a', {"class": "yschttl spt"})  # if there is no match this is a NoneType object
            contenttitle = p_str.renderContents()
            contenttitle = contenttitle.decode('utf-8', 'ignore')
            contenttitle = re.sub("<[^>]+>", "", contenttitle)
            contentlink = str(p_str.get("href"))
            contentauthor = content[i].find('span', {"class": "url"}).renderContents()
            contentauthor = contentauthor.decode('utf-8', 'ignore')
            contenttime = content[i].find('span', {"class": "timestamp"}).renderContents()
            contenttime = contenttime.decode('utf-8', 'ignore')
            # One txt file per article, e.g. filename="D:\\Python27\\tiaozhanbei\\newscn\\%d.txt"%(i)
            file_name = r"D:\Python27\tiaozhanbei\newsen\%d.txt" % (count_news)
            file = open(file_name, 'w')  # note: file is a Python 2 builtin, not a keyword, so shadowing it works, though it is better avoided
            file.write(contenttitle.encode('utf-8'))
            file.write(u'\n')
            file.write(contentauthor.encode('utf-8'))
            file.write(u'\n')
            file.write(contenttime.encode('utf-8'))
            file.write(u'\n' + contentlink + u'\n')
            file.close()
            # Extract the body of each article, skipping Yahoo-hosted pages and a few known bad links
            if contentlink.find("yahoo.com") == -1 and contentlink != 'http://readingeagle.com/ap/article/list-of-winners-of-the-world-press-photo-awards' and contentlink != 'http://www.thepress-sentinel.com/pages/full_story_free/push?article-Bateman+returns++from+study+in+China%20&id=26507031&instance=lead_story' and contentlink != 'http://onlyfans.cstv.com/schools/msu/sports/w-volley/spec-rel/021315aab.html':
                extract_news_content(contentlink, file_name)  # appends the body to the same txt file
            count_news += 1
        # Follow the link to the next result page
        next_page = soup('a', {'href': True, 'id': 'pg-next'})[0]['href']
        req = urllib2.urlopen(next_page)
        print next_page
if __name__ == '__main__':
    key_word = raw_input('input key word:')
    search(key_word)