《Python金融大数据挖掘与分析全流程详解》实战 笔记整理

1、获取新浪财经实时股票数据

# =============================================================================
# 9.1 新浪股票实时数据挖掘实战 by 王宇韬
# =============================================================================

# =============================================================================
# 9.1 Sina Finance real-time stock data scraping
# Fetches the Shanghai Composite Index page with headless Chrome and extracts
# the current price from the rendered HTML with a regex.
# =============================================================================

from selenium import webdriver
import re

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')  # run Chrome without a visible window
browser = webdriver.Chrome(chrome_options=chrome_options)
browser.get('http://finance.sina.com.cn/realstock/company/sh000001/nc.shtml')
data = browser.page_source  # full HTML after JavaScript has rendered the price
browser.quit()

# Regex to extract the stock price.
# NOTE(review): the original pattern's HTML tags were destroyed by the blog
# extraction; reconstructed from the page structure — TODO verify the tag and
# attribute names against the live page / the book's source code.
p_price = '<div id="price" class="(.*?)">(.*?)</div>'
price = re.findall(p_price, data)
print(price)

2、东方财富网数据挖掘实战

# =============================================================================
# 9.2 东方财富网数据挖掘实战 by 王宇韬
# =============================================================================

from selenium import webdriver
import re


def dongfang(company):
    """Scrape East Money (东方财富网) news search results for *company*.

    Opens the search page in headless Chrome, extracts each result's title,
    link and date with regexes, and prints a numbered list. Returns None;
    output is via print() only.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    url = 'http://so.eastmoney.com/news/s?keyword=' + company
    browser.get(url)
    # Grab the rendered page source, then release the browser immediately.
    data = browser.page_source
    browser.quit()

    # NOTE(review): the original regex literals lost their HTML tags during
    # blog extraction; the patterns below are reconstructed from the page's
    # typical markup — TODO verify against the live site / the book's code.
    p_title = '<div class="news-item"><h3><a href=".*?">(.*?)</a>'
    p_href = '<div class="news-item"><h3><a href="(.*?)">'
    p_date = '<p class="news-desc">(.*?)</p>'
    # Extract titles, links and dates from the page source.
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data, re.S)

    for i in range(len(title)):
        # Strip any residual HTML tags inside the title text.
        title[i] = re.sub(r'<.*?>', '', title[i])
        # Keep only the date part, dropping the time-of-day.
        date[i] = date[i].split(' ')[0]
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])


companys = ['华能信托', '阿里巴巴', '腾讯', '京东', '万科']
for i in companys:
    try:
        dongfang(i)
        print(i + '该公司东方财富网爬取成功')
    except Exception as e:  # narrowed from bare except; report why it failed
        print(i + '该公司东方财富网爬取失败')
        print(e)

3、裁判文书网:自动在网页上搜索

# =============================================================================
# 9.3 裁判文书网数据挖掘实战 by 王宇韬
# =============================================================================

# =============================================================================
# 9.3 China Judgements Online (裁判文书网): automate a search in the browser
# Types a keyword into the site's search box, clicks search, waits for the
# results to render, and prints the resulting page source.
# =============================================================================

from selenium import webdriver
import time

browser = webdriver.Chrome()
browser.get('http://wenshu.court.gov.cn/')
browser.maximize_window()

# XPaths of the search input and the search button on the landing page.
input_xpath = '//*[@id="_view_1540966814000"]/div/div/div[2]/input'
button_xpath = '//*[@id="_view_1540966814000"]/div/div/div[3]'

search_box = browser.find_element_by_xpath(input_xpath)
search_box.clear()  # empty the search box first
browser.find_element_by_xpath(input_xpath).send_keys('房地产')  # type the keyword
browser.find_element_by_xpath(button_xpath).click()  # trigger the search

# Fixed wait for the results to load; lengthen if content is still missing.
time.sleep(10)
data = browser.page_source
browser.quit()
print(data)

4、巨潮资讯网:多个指定关键词的公告信息批量爬取

# =============================================================================
# 9.4 巨潮资讯网数据挖掘实战 by 王宇韬
# =============================================================================

from selenium import webdriver
import re


def juchao(keyword):
    """Scrape CNINFO (巨潮资讯网) full-text announcement search for *keyword*.

    Loads the search-result page with Chrome, extracts each announcement's
    title, link and date via regexes, and prints a numbered list. Returns
    None; output is via print() only.
    """
    browser = webdriver.Chrome()
    url = 'http://www.cninfo.com.cn/new/fulltextSearch?notautosubmit=&keyWord=' + keyword
    browser.get(url)
    data = browser.page_source
    browser.quit()

    # NOTE(review): the original regex literals lost their HTML tags during
    # blog extraction; the patterns below are reconstructed — TODO verify
    # against the live site / the book's source code.
    p_title = '<a.*?href=".*?".*?>(.*?)</a>'
    p_href = '<a.*?href="(.*?)".*?>'
    p_date = '<span class="time">(.*?)</span>'
    title = re.findall(p_title, data)
    href = re.findall(p_href, data)
    date = re.findall(p_date, data)

    for i in range(len(title)):
        # Remove residual HTML tags from the title text.
        title[i] = re.sub(r'<.*?>', '', title[i])
        # Relative links on the page need the site prefix.
        href[i] = 'http://www.cninfo.com.cn' + href[i]
        # Undo HTML entity escaping of '&' in the query string.
        href[i] = re.sub('amp;', '', href[i])
        # Keep only the date part, dropping the time-of-day.
        date[i] = date[i].split(' ')[0]
        print(str(i + 1) + '.' + title[i] + ' - ' + date[i])
        print(href[i])


keywords = ['理财', '现金管理', '纾困']
for i in keywords:
    juchao(i)

 

你可能感兴趣的:(爬虫,Python学习)