import requests # 获取网页源代码的库
import re # 正则表达式的库
import docx # 把数据写入word文档的库
# Request header identifying as a desktop Chrome/QQBrowser client so Baidu
# serves the normal HTML results page. BUG FIX: the original line used curly
# quotes (U+2018/U+2019), which are a SyntaxError in Python — replaced with
# ASCII single quotes.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3775.400 QQBrowser/10.6.4209.400'}
def baidu(page):
    """Scrape one page of Baidu News results for 白云机场 (Baiyun Airport,
    stock code 600004) and append them to a TXT log and a Word report.

    NOTE(review): the extraction regexes in this file were corrupted (the
    HTML tags inside the pattern strings were stripped) and ``href`` was
    never assigned. The patterns below are a reconstruction — verify them
    against the live Baidu News result markup before trusting the output.

    Parameters:
        page: 1-based result-page number; Baidu paginates 10 hits per page.
    """
    num = (page - 1) * 10  # pn= offset: page 1 -> 0, page 2 -> 10, ...
    url = 'https://www.baidu.com/s?ie=utf-8&medium=1&rtt=4&bsst=1&rsv_dl=news_b_pn&cl=2&wd=%E7%99%BD%E4%BA%91%E6%9C%BA%E5%9C%BA&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&x_bfe_rqs=03E80&tngroupname=organic_news&newVideo=12&pn=' + str(num)
    res = requests.get(url, headers=headers).text

    # 1. Extract link, title, date and source of every result.
    #    (Selectors reconstructed — TODO confirm against current Baidu HTML.)
    p_href = '<h3 class="news-title_1YtI1">.*?<a href="(.*?)"'
    href = re.findall(p_href, res, re.S)
    p_title = '<h3 class="news-title_1YtI1">.*?>(.*?)</a>'
    title = re.findall(p_title, res, re.S)
    p_time1 = '<span class="c-color-gray2 c-font-normal">(.*?)</span>'
    time1 = re.findall(p_time1, res, re.S)
    p_source = '<span class="c-color-gray c-font-normal c-gap-right">(.*?)</span>'
    source = re.findall(p_source, res, re.S)

    # Guard against the four lists coming back with different lengths —
    # the original indexed everything by len(href) and could IndexError.
    n = min(len(href), len(title), len(time1), len(source))

    # 2. Clean titles: trim whitespace and strip residual tags (<em> etc.).
    for i in range(n):
        title[i] = re.sub('<.*?>', '', title[i].strip())

    # 3. Normalise Chinese dates, e.g. 2021年1月2日 -> 2021-1-2.
    for i in range(n):
        time1[i] = re.sub('年', '-', time1[i])
        time1[i] = re.sub('月', '-', time1[i])
        time1[i] = re.sub('日', '', time1[i])

    # 4. Print the assembled records.
    for i in range(n):
        print(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')')
        print(href[i])

    # 5. Append to the TXT log; 'with' guarantees the handle is closed even
    #    if a write fails (the original left the file open on error).
    with open('D:\\0exam\\news.txt', 'a', encoding='utf-8') as file:
        file.write('数据挖掘completed!' + '\n' + '\n')
        for i in range(n):
            file.write(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')' + '\n')
            file.write(href[i] + '\n')
        file.write('——————————————————————————————' + '\n' + '\n')

    # 6. Append to the Word report (D:\0exam\report.docx must already exist).
    file1 = docx.Document('D:\\0exam\\report.docx')
    for i in range(n):
        file1.add_paragraph(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')')
        file1.add_paragraph(href[i] + '\n')
    file1.save('D:\\0exam\\report.docx')
# Crawl all 35 result pages, reporting progress after each one.
for page in range(1, 36):
    baidu(page)
    print('第' + str(page) + '页爬取成功')
# ==============================
# 广州白云机场(股票代码600004)百度所有新闻(共35页)的爬取
# ==============================
# 一、获取网页源代码
# 基本库的引入
import requests # 获取网页源代码的库
import re # 正则表达式的库
import docx # 把数据写入word文档的库
# Request header: identify as a desktop Chrome/QQBrowser client so Baidu
# serves the normal HTML results page instead of blocking the scraper.
_user_agent = ('Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
               '(KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 '
               'Core/1.70.3775.400 QQBrowser/10.6.4209.400')
headers = {'User-Agent': _user_agent}
# 要爬取的网页
def baidu(page):
    """Scrape one page of Baidu News results for 白云机场 (Baiyun Airport,
    stock code 600004) and append them to a TXT log and a Word report.

    NOTE(review): this copy of the function was corrupted — the entire
    href/title/date/source extraction section was garbled away (only a
    spliced fragment remained). The regexes below are a reconstruction;
    verify them against the live Baidu News result markup before trusting
    the output.

    Parameters:
        page: 1-based result-page number; Baidu paginates 10 hits per page.
    """
    num = (page - 1) * 10  # pn= offset: page 1 -> 0, page 2 -> 10, ...
    # Baidu news search, sorted by time, all news categories.
    url = 'https://www.baidu.com/s?ie=utf-8&medium=1&rtt=4&bsst=1&rsv_dl=news_b_pn&cl=2&wd=%E7%99%BD%E4%BA%91%E6%9C%BA%E5%9C%BA&tn=news&rsv_bp=1&oq=&rsv_btype=t&f=8&x_bfe_rqs=03E80&tngroupname=organic_news&newVideo=12&pn=' + str(num)
    res = requests.get(url, headers=headers).text

    # 1. Extract link, title, date and source of every result.
    #    (Selectors reconstructed — TODO confirm against current Baidu HTML.)
    p_href = '<h3 class="news-title_1YtI1">.*?<a href="(.*?)"'
    href = re.findall(p_href, res, re.S)
    p_title = '<h3 class="news-title_1YtI1">.*?>(.*?)</a>'
    title = re.findall(p_title, res, re.S)
    p_time1 = '<span class="c-color-gray2 c-font-normal">(.*?)</span>'
    time1 = re.findall(p_time1, res, re.S)
    p_source = '<span class="c-color-gray c-font-normal c-gap-right">(.*?)</span>'
    source = re.findall(p_source, res, re.S)

    # Guard against the four lists coming back with different lengths —
    # indexing everything by len(href) alone could IndexError.
    n = min(len(href), len(title), len(time1), len(source))

    # 2. Clean titles: trim whitespace and strip residual tags (<em> etc.).
    for i in range(n):
        title[i] = re.sub('<.*?>', '', title[i].strip())

    # 3. Normalise Chinese dates, e.g. 2021年1月2日 -> 2021-1-2.
    for i in range(n):
        time1[i] = re.sub('年', '-', time1[i])
        time1[i] = re.sub('月', '-', time1[i])
        time1[i] = re.sub('日', '', time1[i])

    # 4. Print the assembled records.
    for i in range(n):
        print(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')')
        print(href[i])

    # 5. Append to the TXT log; 'with' guarantees the handle is closed even
    #    if a write fails.
    with open('D:\\0exam\\news.txt', 'a', encoding='utf-8') as file:
        file.write('数据挖掘completed!' + '\n' + '\n')
        for i in range(n):
            file.write(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')' + '\n')
            file.write(href[i] + '\n')
        file.write('——————————————————————————————' + '\n' + '\n')

    # 6. Append to the Word report. D:\0exam\report.docx must already exist
    #    (create the folder and an empty document first, per the original
    #    author's note; adjust both paths if you relocate it).
    file1 = docx.Document('D:\\0exam\\report.docx')
    for i in range(n):
        file1.add_paragraph(str(i + 1) + '.' + title[i] + '(' + time1[i] + '-' + source[i] + ')')
        file1.add_paragraph(href[i] + '\n')
    file1.save('D:\\0exam\\report.docx')
# 七、爬取所有新闻,共有35页数据
# Crawl all 35 result pages, reporting progress after each one.
for page in range(1, 36):
    baidu(page)
    print('第' + str(page) + '页爬取成功')