Scraping the scroll news from China News Network:
Task: crawl the news content on the first page of 滚动新闻-中国新闻网-梳理天下新闻 (chinanews.com.cn) and save it to txt files (either everything in one file, or one file per news item).
The first page lists 125 news items, so the first step is to collect the hyperlinks of all of them:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

html = urlopen('https://www.chinanews.com.cn/scroll-news/news1.html')
bs = BeautifulSoup(html, 'html.parser')
i = 0
# Only links inside the content_right div whose href matches the
# 2022/04-08 article pattern belong to the scroll-news list.
for link in bs.find('div', {"id": "content_right"}).find_all(
        'a', href=re.compile(r"[a-z]*/[a-z]*[0-9]*/*[a-z]*-*[a-z]*/2022/04-08/[a-z]*[0-9]*\.shtml")):
    if 'href' in link.attrs:
        print("https://www.chinanews.com.cn/" + link.attrs['href'])
        i += 1
print(i)
Output: the 125 article URLs, followed by the count printed by print(i).
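As a side note, concatenating the base URL with the href by hand can produce a doubled slash when the href already starts with "/". A minimal, more defensive sketch using urllib.parse.urljoin; the assumption here is that hrefs may be either relative or root-relative, and the ".shtml" filter is a simplification of the date regex used above:
from urllib.parse import urljoin

base = "https://www.chinanews.com.cn/"
links = []
for link in bs.find('div', {"id": "content_right"}).find_all('a', href=True):
    href = link.attrs['href']
    if href.endswith(".shtml"):  # crude filter: keep only article pages
        links.append(urljoin(base, href))
print(len(links))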
Store the hyperlinks in a list:
newsPages = []  # list that stores the article hyperlinks
for link in bs.find('div', {"id": "content_right"}).find_all(
        'a', href=re.compile(r"[a-z]*/[a-z]*[0-9]*/*[a-z]*-*[a-z]*/2022/04-08/[a-z]*[0-9]*\.shtml")):
    if 'href' in link.attrs:
        url = "https://www.chinanews.com.cn/" + link.attrs['href']
        newsPages.append(url)
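If the list could contain the same article twice (an assumption, not something confirmed for this page), an order-preserving de-duplication pass is cheap to add before downloading:
newsPages = list(dict.fromkeys(newsPages))
print(len(newsPages))  # still 125 if there were no duplicates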
Define a function for getting a page. Note that this first version hands the URL string straight to BeautifulSoup, so all it returns is the URL text itself:
def getNewsPage(url):
    # BeautifulSoup parses the URL string here, not the downloaded page,
    # so this only returns the URL text.
    bs = BeautifulSoup(url, 'html.parser')
    return bs.text
To actually download the page source (including the news content), change it as follows:
def getNewsPage(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    # bs.text is already a str, so no extra encoding step is needed here
    return bs.text
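bs.text on the whole document also includes navigation, script and footer text. A sketch of pulling out only the article body; the class name "left_zw" is an assumption about chinanews' article template and should be checked against the real page source:
def getNewsBody(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    body = bs.find('div', {"class": "left_zw"})  # hypothetical body container
    # fall back to the full page text if the assumed container is missing
    return body.get_text(strip=True) if body else bs.text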
Define the file-saving function:
def saveNews(newsPage, filename):
    f = open(filename, "wb")
    f.write(newsPage.encode("utf-8"))
    f.close()
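An equivalent, slightly more idiomatic version writes text directly and uses a context manager, so the file is closed even if writing fails:
def saveNews(newsPage, filename):
    with open(filename, "w", encoding="utf-8") as f:
        f.write(newsPage)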
Name each file after the last path segment of its URL and save all the page sources:
# newsPage = 0
for pageUrl in newsPages:
    html = getNewsPage(pageUrl)
    filename = pageUrl[pageUrl.rindex("/")+1:pageUrl.rindex(".")]
    # filename = str(newsPage) + '.txt'
    saveNews(html, filename)
    # newsPage += 1   # only needed for the numbered-filename variant below
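In principle two articles could end in the same basename (an assumption; if it happened, one file would silently overwrite the other), and this variant also drops the .txt extension. A small sketch that adds an index prefix and the extension:
for i, pageUrl in enumerate(newsPages):
    html = getNewsPage(pageUrl)
    basename = pageUrl[pageUrl.rindex("/")+1:pageUrl.rindex(".")]
    saveNews(html, str(i) + "_" + basename + ".txt")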
Alternatively, save each page to its own numbered file:
newsPage = 0
for pageUrl in newsPages:
    html = getNewsPage(pageUrl)
    # filename = pageUrl[pageUrl.rindex("/")+1:pageUrl.rindex(".")]
    filename = str(newsPage) + '.txt'
    saveNews(html, filename)
    newsPage += 1
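Note that running both loops downloads every article twice. If both naming schemes are wanted, a sketch that fetches each page once and saves it under both names:
newsPage = 0
for pageUrl in newsPages:
    html = getNewsPage(pageUrl)
    basename = pageUrl[pageUrl.rindex("/")+1:pageUrl.rindex(".")]
    saveNews(html, basename + ".txt")       # URL-based name
    saveNews(html, str(newsPage) + ".txt")  # numbered name
    newsPage += 1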
The complete code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re


def getNewsPage(url):
    html = urlopen(url)
    bs = BeautifulSoup(html, 'html.parser')
    return bs.text


def saveNews(newsPage, filename):
    f = open(filename, "wb")
    f.write(newsPage.encode("utf-8"))
    f.close()


html = urlopen('https://www.chinanews.com.cn/scroll-news/news1.html')
bs = BeautifulSoup(html, 'html.parser')
newsPages = []  # list that stores the article hyperlinks
for link in bs.find('div', {"id": "content_right"}).find_all(
        'a', href=re.compile(r"[a-z]*/[a-z]*[0-9]*/*[a-z]*-*[a-z]*/2022/04-08/[a-z]*[0-9]*\.shtml")):
    if 'href' in link.attrs:
        url = "https://www.chinanews.com.cn/" + link.attrs['href']
        newsPages.append(url)

newsPage = 0
for pageUrl in newsPages:
    html = getNewsPage(pageUrl)
    print(html)
    # filename = pageUrl[pageUrl.rindex("/")+1:pageUrl.rindex(".")]
    filename = str(newsPage) + '.txt'
    saveNews(html, filename)
    newsPage += 1
The content of a generated file looks like this:
Hyperlinks:
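The complete script stops at the first network error and requests pages as fast as it can. A hedged hardening of the download loop, assuming transient errors are possible and a short delay between requests is acceptable:
import time
from urllib.error import URLError, HTTPError

for newsPage, pageUrl in enumerate(newsPages):
    try:
        html = getNewsPage(pageUrl)
    except (URLError, HTTPError) as e:
        print("skipping", pageUrl, e)
        continue
    saveNews(html, str(newsPage) + '.txt')
    time.sleep(0.5)  # be polite to the server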
If you do not want to use BeautifulSoup, you can write it with requests and a regular expression instead, but with a small flaw: the regex finds 127 links, two more than expected, most likely because it scans the entire HTML rather than only the content_right list.
import requests
import re


def getnews(url):
    aes = requests.get(url)
    aes.encoding = "utf-8"
    return aes.text


def savepage(page, filename):
    f = open(filename, "wb")
    f.write(page.encode("utf-8"))
    f.close()


url = "https://www.chinanews.com.cn/scroll-news/news1.html"
res = requests.get(url)
res.encoding = 'utf-8'
# Match every article path of the form .../2022/04-08/xxxx.shtml in the raw HTML.
g = re.findall(r"[a-z]*/[a-z]*[0-9]*/*[a-z]*-*[a-z]*/2022/04-08/[a-z]*[0-9]*\.shtml", res.text)
# print(len(g))
for i in range(0, len(g)):
    g[i] = "https://www.chinanews.com.cn/" + g[i]
for u in g:
    pc = getnews(u)
    filename = u[u.rindex("/")+1:u.rindex(".")]
    savepage(pc, filename)
    print(u + "===>" + filename)
Output:
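One plausible explanation for the two extra matches is that the regex also picks up article links outside the scroll-news list (an assumption; the extras could also be duplicates). A sketch that narrows the search to the part of the HTML starting at the content_right block, mirroring what the BeautifulSoup version does:
start = res.text.find('id="content_right"')
scope = res.text[start:] if start != -1 else res.text
g = re.findall(r"[a-z]*/[a-z]*[0-9]*/*[a-z]*-*[a-z]*/2022/04-08/[a-z]*[0-9]*\.shtml", scope)
g = list(dict.fromkeys(g))  # also drop exact duplicates, just in case
print(len(g))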