# Batch-fetch Baidu News for multiple companies
from urllib import request, parse
import random
import time
import re


class BaiduNewsSpider(object):
    def __init__(self):
        # Note: the query parameters are joined with '&' ('news7rtt' in the original was a typo)
        self.url = 'https://www.baidu.com/s?tn=news&rtt=1&bsst=1&cl=2&wd={}'
        # Counter for saved pages
        self.i = 0

    # Send the request and return the page source
    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode()
        return html

    # Parse (placeholder for now; a regex pattern written against the actual
    # result-page HTML would be compiled with re.compile(pattern, re.S),
    # applied with findall(), and the matches passed to write_html())
    def parse_html(self, html):
        pass

    # Save the raw HTML to a file
    def write_html(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
        self.i += 1

    # Main routine
    def run(self):
        companys = ['华能信托', '阿里巴巴', '百度集团', '万科集团', '腾讯', '京东']
        for company in companys:
            try:
                wd = parse.quote(company)
                url = self.url.format(wd)
                html = self.get_html(url)
                filename = '{}百度新闻.html'.format(company)
                self.write_html(filename, html)
                # Random sleep between requests
                time.sleep(random.uniform(1, 2))
                print(company + ': Baidu News fetched successfully')
            except Exception:
                print(company + ': Baidu News fetch failed')
        print('Pages saved:', self.i)


if __name__ == '__main__':
    start = time.time()
    spider = BaiduNewsSpider()
    spider.run()
    end = time.time()
    print('Elapsed: %.2f seconds' % (end - start))
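Instead of hardcoding a single User-Agent string, the fake_useragent package (imported but unused in the original script) can supply a random one for each request, which makes the traffic look less uniform. A minimal sketch, assuming fake_useragent is installed; the standalone get_html below is only an illustration, not part of the class above:

from urllib import request
from fake_useragent import UserAgent

def get_html(url):
    # Draw a random, real-world User-Agent string for every request
    ua = UserAgent()
    headers = {'User-Agent': ua.random}
    req = request.Request(url=url, headers=headers)
    return request.urlopen(req).read().decode()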
II. Batch-fetching multiple pages of Baidu News for multiple companies
1. Batch data fetching
The code is as follows (example):
# Batch-fetch multiple pages of Baidu News for multiple companies
from urllib import request, parse
import random
import time
import re


class BaiduNewsSpider(object):
    def __init__(self):
        # rtt=4 sorts results by time; the query parameters are joined with '&'
        self.url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&pn={}'
        # Example of a time-sorted query:
        # https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd=%E4%B8%87%E7%A7%91%E9%9B%86%E5%9B%A2&medium=0
        # Counter for saved pages
        self.i = 0

    # Send the request and return the page source
    def get_html(self, url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        res = request.urlopen(req)
        html = res.read().decode()
        return html

    # Parse (placeholder for now; a regex pattern written against the actual
    # result-page HTML would be compiled, applied with findall(), and the
    # matches passed to write_html())
    def parse_html(self, html):
        pass

    # Save the raw HTML to a file
    def write_html(self, filename, html):
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(html)
        self.i += 1

    # Main routine
    def run(self):
        companys = ['华能信托', '阿里巴巴', '百度集团', '万科集团', '腾讯', '京东']
        for company in companys:
            wd = parse.quote(company)
            for i in range(1, 4):
                # pn steps by 10 per page: page 1 -> 0, page 2 -> 10, page 3 -> 20
                pn = (i - 1) * 10
                url = self.url.format(wd, pn)
                html = self.get_html(url)
                filename = '{}百度新闻-第{}页.html'.format(company, i)
                self.write_html(filename, html)
                print('Page %d fetched successfully' % i)
                # Sleep 1-3 seconds after each page
                time.sleep(random.randint(1, 3))
            print(company + ': Baidu News fetched successfully')
        print('Pages saved:', self.i)


if __name__ == '__main__':
    start = time.time()
    spider = BaiduNewsSpider()
    spider.run()
    end = time.time()
    print('Elapsed: %.2f seconds' % (end - start))
2. Data cleaning
The code is as follows (example):
import requests
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}
url = 'https://www.sogou.com/sogou?query=%E9%98%BF%E9%87%8C%E5%B7%B4%E5%B7%B4&interation=1728053249&pid=sogou-wsse-9fc36fa768a74fa9&ie=utf8&w=&sut=6046&sst0=1612509795700&lkt=1%2C1612509795594%2C1612509795594'
# timeout: if the request takes longer than the given number of seconds, raise an exception and stop
res = requests.get(url, headers=headers, timeout=10).text
# Extract the data with a regular expression.
# NOTE: the HTML tags inside the original pattern were lost when this post was copied;
# the skeleton below needs four capture groups (id, url, title, publish time)
# rebuilt from the live page source.
re_bds = '.*?(.*?).*?.*?.*?(.*?)'
pattern = re.compile(re_bds, re.S)
r_list = pattern.findall(res)
# Data cleaning: strip any residual HTML tags from each captured field
item = {}
for r in r_list:
    item['id'] = re.sub('<.*?>', '', r[0].strip())
    item['url'] = re.sub('<.*?>', '', r[1].strip())
    item['title'] = re.sub('<.*?>', '', r[2].strip())
    item['publish_time'] = re.sub('<.*?>', '', r[3].strip())
    print(item)
    print('*' * 50)
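The cleaning step above relies on re.sub with the non-greedy pattern '<.*?>' to strip residual HTML tags from every captured field. A minimal, self-contained illustration of the technique (the sample string is made up for demonstration):

import re

raw = '<a href="https://example.com"><em>阿里巴巴</em>发布财报</a>'
text = re.sub('<.*?>', '', raw)     # remove every HTML tag
text = re.sub('&.*?;', '', text)    # remove HTML entities such as &nbsp;
print(text)                         # 阿里巴巴发布财报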
3. Combining steps 1 and 2
The code is as follows (example):
# Combine steps 1 and 2: batch-fetch news for multiple companies (Sogou demo) and clean the data
from urllib import parse
import requests
import random
import time
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}


def Sougou(company):
    url = ('https://www.sogou.com/sogou?ie=utf8&p=40230447&interation=1728053249'
           '&interV=&pid=sogou-wsse-8f646834ef1adefa&query={}&shid=djt1')
    # The company name must be URL-encoded before it is inserted into the query string
    url = url.format(parse.quote(company))
    res = requests.get(url, headers=headers, timeout=10).text
    # print(res)
    # Regular expressions for extraction.
    # NOTE: the HTML tags inside these patterns were lost when this post was copied;
    # they are skeletons only and must be rebuilt from the live page source
    # (p_href in particular needs a capture group around the link URL).
    p_title = '.*?(.*?)'
    title = re.compile(p_title, re.S).findall(res)
    p_href = '.*?'
    href = re.compile(p_href, re.S).findall(res)
    p_date = '.*?.*?(.*?)'
    date = re.compile(p_date, re.S).findall(res)
    print(title, href, date)
    # Clean and print the results
    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i])
        title[i] = re.sub('&.*?;', '', title[i])
        date[i] = re.sub('<.*?>', '', date[i])
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])


companys = ['华能信托', '阿里巴巴', '百度集团', '万科集团', '腾讯', '京东']
for company in companys:
    Sougou(company)
    # Sleep 1-3 seconds after each company
    time.sleep(random.randint(1, 3))
    print(company + ': Sogou news fetched successfully')
III. Batch-fetching Sina Finance news for multiple companies
# Batch-fetch Sina Finance news for multiple companies and clean the data
# (the Chinese company name cannot go into the URL directly; it must be URL-encoded first)
from urllib import parse
import requests
import random
import time
import re

headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36'}


def Sina(company):
    url = 'https://search.sina.com.cn/?q={}&c=news&from=channel'
    url = url.format(parse.quote(company))
    res = requests.get(url, headers=headers, timeout=10).text
    # print(res)
    # Regular expressions for extraction.
    # NOTE: the HTML tags inside these patterns were lost when this post was copied;
    # they are skeletons only and must be rebuilt from the live page source
    # (p_href in particular needs a capture group around the link URL).
    p_title = '(.*?)'
    title = re.compile(p_title, re.S).findall(res)
    p_href = ''
    href = re.compile(p_href, re.S).findall(res)
    p_date = '(.*?)'
    date = re.compile(p_date, re.S).findall(res)
    # print(title, href, date)
    # Clean and print the results
    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i])
        title[i] = re.sub('&.*?;', '', title[i])
        date[i] = re.sub('<.*?>', '', date[i])
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])


companys = ['华能信托', '阿里巴巴', '百度集团', '万科集团', '腾讯', '京东']
for company in companys:
    Sina(company)
    # Sleep 1-3 seconds after each company
    time.sleep(random.randint(1, 3))
    print(company + ': Sina Finance news fetched successfully')
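One caveat with the cleaning loops above: title, href, and date are extracted independently, so if one pattern fails to match, the lists end up with different lengths and indexing by range(len(title)) raises an IndexError. A more defensive sketch that pairs the lists with zip (clean_and_print is a hypothetical helper, not part of the original code):

import re

def clean_and_print(title, href, date):
    # zip stops at the shortest list, so a failed match cannot cause an IndexError
    for i, (t, h, d) in enumerate(zip(title, href, date), start=1):
        t = re.sub('<.*?>', '', t)
        t = re.sub('&.*?;', '', t)
        d = re.sub('<.*?>', '', d).strip()
        print('{}.{} - {}'.format(i, t, d))
        print(h)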
Summary
The above covers the hands-on web-scraping examples; next I will continue with financial data analysis in Python.