前段时间写了如何用Python进行简单的数据挖掘,主要是从百度和新浪财经挖掘公司的新闻数据详情请看这里
基于Python的简单数据挖掘
今天我们在这个的基础上再进行数据去重、筛选还有舆情打分。因为要去重,所以我们把数据放到了MySQL上面,然后根据标题进行简单的去重,代码如下:
select_sql = 'SELECT * FROM test WHERE title = %s'
cur.execute(select_sql, title)
if len(data_all) == 0:#表示没有重复数据
然后去重之后再根据关键词进行舆情筛选关键词是
keywords = ['违约', '诉讼', '兑付']#舆论关键词
然后如果标题或者文章正文出现这些关键词的话就扣5分,代码如下
num = 0
article = requests.get(href).text
article = encoed(article)
p_article = '&lt;p&gt;(.*?)&lt;/p&gt;'  # 注:原正则的HTML标签在发布时被页面吞掉了,此处为推测还原
article_main = re.findall(p_article, article, re.S)
article = ''.join(article_main)
for k in keywords:
if (k in article) or (k in title):
num -= 5
这样就OK了,当然关键词可以自行修改,这里就不赘述。然后可以通过查询数据库获取每天某家公司的舆情分,再看这个分与股票的关系。当然这里只是简单的一个判断,真正的数据分析还会涉及很多金融知识,我只是一个喜欢金融的程序员而已,还做不到那个地步。以下是全部代码,需要的小伙伴可以拿去运行看看,里面对时间的统一还有乱码都进行了处理。
import requests
import re
import pymysql
import time
# headers mimics a real browser; the User-Agent value can be obtained by
# entering chrome://version/ in the browser address bar (the "User Agent" field).
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36'}
# Parameterized SQL statements shared by the functions below.
sql = 'INSERT INTO test (company, title, href, date, source, score) VALUES (%s, %s, %s, %s, %s, %s)'
select_sql = 'SELECT * FROM test WHERE title = %s'  # de-duplication lookup by exact title
get_score_sql = 'SELECT score FROM test WHERE company LIKE %s AND date LIKE %s'  # per-company, per-day scores
companys = ['华能信托', '阿里巴巴', '万科集团', '百度集团', '腾讯', '京东']# companies to crawl; add/remove freely
keywords = ['违约', '诉讼', '兑付']# negative-sentiment keywords (each hit deducts 5 points)
def getScore(company):
    """Print (and return) today's accumulated sentiment score for *company*.

    Starts from a base score of 100 and adds every per-article score
    (stored as 0 or a negative number) recorded in the `test` table today.
    Returns the total so callers can also use it programmatically.
    """
    today = time.strftime('%Y-%m-%d')
    # NOTE(review): connection settings are hard-coded; port 3308 is
    # non-standard for MySQL -- confirm it matches the local setup.
    db = pymysql.connect(host='localhost', port=3308, user='root', password='root',
                         database='pachong', charset='utf8')
    try:
        cur = db.cursor()
        try:
            cur.execute(get_score_sql, (company, today))
            data = cur.fetchall()
        finally:
            cur.close()
    finally:
        # always release the connection, even if the query raises
        db.close()
    score = 100
    for row in data:
        score += row[0]
    print(company + ': ' + str(score))
    return score
# Normalize crawled date strings.
def initDate(date):
    """Normalize a crawled date string to 'YYYY-M-D' form.

    Handles the shapes seen in the crawled pages:
    - 'YYYY年M月D日 ...'              -> 'YYYY-M-D'
    - relative times ('N小时前'/'N分钟前') -> today's date
    - anything else: keep the part before the first space unchanged.
    """
    date = date.split(' ')[0]
    date = re.sub('年', '-', date)
    date = re.sub('月', '-', date)
    # Bug fix: '日' terminates the date rather than separating fields --
    # replacing it with '-' produced malformed dates like '2020-1-5-'.
    date = re.sub('日', '', date)
    if ('小时' in date) or ('分钟' in date):
        # relative timestamps ("N hours/minutes ago") mean "today"
        date = time.strftime("%Y-%m-%d")
    return date
# Score one news item and insert it into the database if it is new.
def addData(sql, company, title, href, date, source):
    """Fetch the article at *href*, compute its sentiment score, and insert.

    Deducts 5 points per sentiment keyword found in the article body or the
    title, then inserts the row unless a record with the same title already
    exists (simple title-based de-duplication). Scoring failures are
    tolerated: the row is still considered with score 0.
    """
    num = 0  # per-article sentiment score (0 or negative)
    try:
        article = requests.get(href, headers=headers, timeout=20).text
        article = encoed(article)
        # NOTE(review): the original pattern was mangled when this code was
        # published (HTML tags inside the literal were stripped);
        # '<p>(.*?)</p>' is a reconstruction -- confirm against real markup.
        p_article = '<p>(.*?)</p>'
        article_main = re.findall(p_article, article, re.S)
        article = ''.join(article_main)
        for k in keywords:
            if (k in article) or (k in title):
                num -= 5
    except Exception:
        # best-effort: a single failed article must not abort the crawl
        article = '单个新闻爬取失败'
    # Crude relevance check: company name in the title, or "first char
    # ... last char" of the name appearing more than once in the body.
    company_re = company[0] + '.{0,5}' + company[-1]
    if (company in title) or (len(re.findall(company_re, article)) > 1):
        db = pymysql.connect(host='localhost', port=3308, user='root', password='root',
                             database='pachong', charset='utf8')
        try:
            cur = db.cursor()
            try:
                cur.execute(select_sql, (title,))  # pass params as a tuple
                data_all = cur.fetchall()
                if len(data_all) == 0:  # no row with this title yet
                    cur.execute(sql, (company, title, href, initDate(date), source, num))
                    db.commit()
            finally:
                cur.close()
        finally:
            db.close()  # release the connection even on query errors
# Mojibake repair: re-interpret a mis-decoded response body.
def encoed(res):
    """Fix mojibake caused by requests decoding Chinese pages as ISO-8859-1.

    Round-trips the text back to its raw bytes and decodes as utf-8, then
    gbk, falling back to the input unchanged if neither works.
    (Function name kept as 'encoed' for existing callers.)
    """
    try:
        # Bug fix: the original called non-existent str methods
        # ('encoed'/'edcode'), so this repair never actually ran.
        res = res.encode('ISO-8859-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        try:
            res = res.encode('ISO-8859-1').decode('gbk')
        except (UnicodeEncodeError, UnicodeDecodeError):
            pass  # undecodable: return the text as-is
    return res
# Crawl Sina Finance news search results for one company.
def sina(company):
    """Extract (href, title, date) from Sina's news search results page
    and hand each item to addData with source 'sina'.
    """
    url = 'https://search.sina.com.cn/?q=' + company + '&c=news&range=all&'
    res = requests.get(url, headers=headers, timeout=20).text
    res = encoed(res)
    # NOTE(review): the original regexes were destroyed when this code was
    # published (HTML tags inside the string literals were stripped). The
    # patterns below are reconstructions for Sina's search-result markup --
    # verify against the live page source before relying on them.
    p_href = '<h2><a href="(.*?)" target="_blank">'
    p_title = '<h2><a href=".*?" target="_blank">(.*?)</a>'
    p_date = '<span class="fgray_time">(.*?)</span>'
    href = re.findall(p_href, res, re.S)
    title = re.findall(p_title, res, re.S)
    date = re.findall(p_date, res, re.S)
    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i])  # strip residual tags
        # assumes date text is '来源 YYYY-MM-DD HH:MM' -- keep the middle
        # (date) field; TODO confirm against the live page
        date[i] = date[i].split(' ')[1]
        addData(sql, company, title[i], href[i], date[i], 'sina')
# Crawl Baidu News results for one company, sorted by time.
def baidu(company):
    """Extract (href, title, source, date) from Baidu News results and hand
    each item to addData.

    rtt=4 requests time-sorted results (rtt=1 is relevance ranking); other
    sites need their own URL and extraction patterns.
    """
    url = "https://www.baidu.com/s?rtt=4&bsst=1&cl=2&tn=news&word=" + company
    res = requests.get(url, headers=headers, timeout=20).text
    res = encoed(res)
    # NOTE(review): the original regexes were destroyed when this code was
    # published (HTML tags inside the string literals were stripped). The
    # patterns below are reconstructions for Baidu News markup -- verify
    # against the live page source (right-click -> view source).
    p_href = '<h3 class="c-title">.*?<a href="(.*?)"'
    p_title = '<h3 class="c-title">.*?>(.*?)</a>'
    p_info = '<p class="c-author">(.*?)</p>'
    href = re.findall(p_href, res, re.S)
    title = re.findall(p_title, res, re.S)
    info = re.findall(p_info, res, re.S)
    source = []  # news outlet per item
    date = []    # publish date per item
    for i in range(len(title)):
        title[i] = re.sub('<.*?>', '', title[i].strip())
        info[i] = re.sub('<.*?>', '', info[i])
        # assumes info is '来源 时间'; the raw delimiter may be
        # '&nbsp;&nbsp;' -- TODO confirm on the live page
        source.append(info[i].split(' ')[0].strip())
        # guard against items with no date field instead of crashing
        date.append(info[i].split(' ')[1].strip() if ' ' in info[i].strip() else '')
        addData(sql, company, title[i], href[i], date[i], source[i])
# Crawl every configured company; one company failing must not stop the rest,
# so each iteration is wrapped individually.
for i in companys:
    try:
        sina(i)
        baidu(i)
        print(i + '新闻爬取成功')
        getScore(i)
    except Exception as e:
        # surface the reason instead of silently swallowing it
        print(i + '新闻爬取失败: ' + str(e))