使用 selenium webdriver 抓取新浪财经24小时快讯

抓取新浪网24小时快讯,同时存入数据表与文件中。 程序使用了 selenium webdriver 模拟鼠标下拉行为获取页面内容,可惜本机测试只抓到了10天数据,新浪太小气。本程序修改一下对抓取其他下拉式网页有一定参考价值。代码git https://github.com/dreamnotover/crawler_example

sinanew.py

# -*- coding:utf-8 -*- 
from lxml.html import etree
import  os
import pymysql
import logging
import random  
from selenium import webdriver
 
import json
import time 
from time import sleep
 
from  Config  import *
# 通过下面的方式进行简单配置输出方式与日志级别
import logging
import  datetime
global rows
global values
url='http://finance.sina.com.cn/7x24/?tag=10'
basedir=os.path.dirname(os.path.realpath(__file__))
logfile="%s\\log\\sina_%s.log" %(basedir,datetime.datetime.now().strftime("%Y-%m-%d"))   
logging.basicConfig(filename=logfile, level=logging.DEBUG,format='%(asctime)s  %(message)s')

logfilename="%s\\DATA\\sina_%s.sql" %(basedir,datetime.datetime.now().strftime("%Y-%m-%d"))  
 
start = time.time()
infor="Import all  stocks begins!"
logging.info(infor)     
rows=0
values=[] 
header="INSERT INTO `sina_news` (`n_date`, `n_time`, `content`) VALUES" 
middle="(%s, %s, %s)"

query =header+middle


def write_content_to_file(filename,message, encoding='utf-8'):
    file_object = open(filename,'a+')
    try:
        #file_object.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\t'+message+'\n')
        file_object.write(message+'\n')
    finally:
        file_object.close()
    
def writeFile(message):
    file_object = open(logfilename,'w+', encoding='utf-8')
    try:
        #file_object.write(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\t'+message+'\n')
        file_object.write(message+'\n')
    finally:
        file_object.close()
        
 
        
    
def  insert_query(query,values):
 
    if  len(values) > 0 :
        sqlstr=json.JSONEncoder().encode(values)
        sqlstr=sqlstr.replace('[', '(').replace(']',')').replace('((', '(').replace('))',')')
        sqlstr= header+sqlstr
        writeFile(sqlstr) 
        connection = pymysql.connect(host=HOST,port=int(PORT),user=USERNAME,passwd=PASSWORD,db=DATABASE,charset='utf8')
        mycursor= connection.cursor()
        
        try:
            # Execute the SQL command
           
            mycursor.executemany(query, values)
            connection.commit()
            affected_rows = mycursor.rowcount
            if affected_rows:
                print("Number of rows affected : {}".format(affected_rows))
                logging.info("Number of rows affected : {}".format(affected_rows))
            else:
                print('0 row  affected!')
                logging.info('0 row  affected!') 
            values=[]
            sqlstr=""
        except  Exception as e:
            #writeFile("Exeception occured:{}".format(e))
            print("Exeception occured:{}".format(e))
            logging.error("Exeception occured:{}".format(e))
            connection.rollback()
        finally:
            mycursor.close()
            connection.close()
 
 
def scroll_top(driver):
    if driver.name == "chrome":
        js = "var q=document.body.scrollTop=0"
    else:
        js = "var q=document.documentElement.scrollTop=0"
    return driver.execute_script(js)
         
 
def scroll_foot(driver):
    if driver.name == "chrome":
        js = "var q=window.scrollTo(0, document.body.scrollHeight);"
    else:
        js = "var q=document.documentElement.scrollTop=100000"
    return driver.execute_script(js)
         
def write_text(filename, info):
    """  
    :param info:  
    :return: none
    """
    #  
    with open(filename, 'a+') as fp:  
        fp.write(str(info))
        fp.write('\n')
        fp.write('\n') 
     
def sroll_multi(driver):
    loopCounter = 0
    lastHeight = driver.execute_script("return document.body.scrollHeight")
    while True:
        if loopCounter >5500:
            break; # if the account follows a ton of people, its probably a bot, cut it off
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(random.randint(2,9))
        newHeight = driver.execute_script("return document.body.scrollHeight")
        if newHeight == lastHeight:
            break
        lastHeight = newHeight
        loopCounter = loopCounter + 1
 
 
 
            
options = webdriver.ChromeOptions()  
   
options.add_argument("--headless")  
driver = webdriver.Chrome(chrome_options = options,executable_path='D:\\chromedriver.exe')
driver.implicitly_wait(30)
driver.maximize_window()
   
driver.implicitly_wait(10)
# driver.maximize_window()
driver.get(url)
sroll_multi(driver)  
  
data_secs = driver.find_elements_by_xpath('//*[@id="liveList01"]/div')
for data_sec  in  data_secs:
    day = data_sec.get_attribute('data-time')
    hour_min= data_sec.find_element_by_class_name('bd_i_time_c')
    content= data_sec.find_element_by_class_name('bd_i_txt_c')
    print(day,content.text)
     
    filename="%s\\data\\sina24x_%s.txt" %(basedir,day)
    
    value=[  day , hour_min.text, content.text]
    filecontent= hour_min.text+'\t'+ content.text
    write_content_to_file(filename, filecontent)
    values.append(value)
  
    rows=rows+1
    if (rows % 50 == 0):
        insert_query(query,values)
        values=[]
if len(values) > 0 :            
    insert_query(query,values)
    values=[]

 

config.py

# please change according to your own settings 
#HOST= "192.168.101.10"
#HOST= "192.168.101.186"
HOST= "192.168.101.74"
PORT="3306"
USERNAME = "root"
PASSWORD = "123456"
DATABASE = "stockcn"
#DATABASE = "fin_predict_db"
DBCHARSET="utf8"
 
 

你可能感兴趣的:(python,量化金融)