# webdriver  (stray title fragment from the blog page this file was extracted from)

import requests
import os
import pymysql
import uuid
import re
from selenium import webdriver
from bs4 import BeautifulSoup


def getHeaders():
    """Return the default HTTP request headers for page fetches.

    A fixed desktop IE user-agent keeps the target site from serving a
    bot-detected or mobile variant of the page.
    """
    return {
        'User-Agent': (
            'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1; 125LA; '
            '.NET CLR 2.0.50727; .NET CLR 3.0.04506.648; .NET CLR 3.5.21022)'
        ),
    }

def fillList(infos, i):
    """Scrape one feed item: title, link, article text, and cover image.

    Args:
        infos: dict accumulator; keys 'title', 'url', 'html', 'img' are set.
        i: selenium WebElement for one feed item (`div.item-inner`).

    Returns:
        The same ``infos`` dict (also mutated in place).  On any scraping
        error the item is skipped best-effort and ``infos`` may hold stale
        values from the previous item.
    """
    try:
        link = i.find_element_by_xpath('./div[1]/div/div/a')
        infos['title'] = link.text
        infos['url'] = link.get_attribute("href")
        print('----', infos['url'])

        # Fetch the article (second-level) page.
        resp_detail = requests.get(url=infos['url'], headers=getHeaders())
        content_detail = resp_detail.content.decode('utf-8')
        # NOTE(review): the two original regex literals were destroyed when
        # this file was extracted (their angle-bracket markup was stripped).
        # From the surviving code they (1) pulled the article-body element out
        # of the page and (2) removed HTML tags from it.  The patterns below
        # are a plausible reconstruction — confirm against the live page.
        matches = re.findall(r'<div class="article-content">.*?</div>',
                             content_detail, re.S)
        infos['html'] = re.sub(r'<[^>]+>', '', matches[0])  # strip tags

        # Download the cover image into ../image/<uuid>.jpg.
        img_url = i.find_element_by_xpath('./div[2]/a/img').get_attribute("src")
        resp_img = requests.get(url=img_url)
        parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        folder_path = os.path.join(parent_dir, "image")
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        img_name = str(uuid.uuid1()) + '.jpg'
        with open(os.path.join(folder_path, img_name), 'wb') as f:
            f.write(resp_img.content)
        infos['img'] = img_name
        print('提取信息成功')
    except Exception as e:
        # Best-effort: a failed item must not abort the whole crawl.
        # The exception is now printed instead of being silently dropped.
        print('提取信息失败', e)
    return infos


def printInfo(infos, inf):
    """Create the ``news`` table (best-effort) and insert up to 7 items.

    Args:
        infos: dict accumulator passed through to :func:`fillList`.
        inf: list of selenium WebElements, one per feed item.
    """
    conn = pymysql.connect(host='localhost', port=3306, user='root',
                           passwd='root', db='jh_project01', charset='utf8')
    cur = conn.cursor()
    sqlc = '''
    create table news(
    id int primary key auto_increment,
    title varchar(60),
    img varchar(60),
    url varchar(100),
    html longtext)DEFAULT CHARSET=utf8;
    '''
    try:
        cur.execute(sqlc)
        conn.commit()
        print("成功")
    except pymysql.MySQLError:
        # Table most likely exists already; keep going.
        print("错误")

    # Loop-invariant statement hoisted out of the loop.
    sqla = '''
    insert into news(title,img,url,html) values(%s,%s,%s,%s);
    '''
    for item, i in enumerate(inf):
        if item == 7:  # cap the crawl at 7 items
            break
        infos = fillList(infos, i)
        try:
            cur.execute(sqla, (infos['title'], infos['img'],
                               infos['url'], infos['html']))
            conn.commit()
            print("成功")
        except pymysql.MySQLError:
            print("失败")
    conn.commit()
    cur.close()
    conn.close()


def main():
    """Open the toutiao tech-news feed, collect item nodes, scrape and store."""
    infos = {}
    driver = webdriver.Chrome()
    try:
        driver.get('https://www.toutiao.com/ch/news_tech/')
        # Scroll down so lazy-loaded feed items render before we query them.
        driver.execute_script("var q=document.documentElement.scrollTop=500")
        inf = driver.find_elements_by_xpath(
            '//div[@class="wcommonFeed"]/ul/li[@class="item "]'
            '/div[@class="item-inner y-box"]')
        del inf[0]  # first node is not a real article item
        print(inf)
        # print(len(inf))
        printInfo(infos, inf)
    finally:
        # Always release the browser, even if scraping raises.
        driver.close()


if __name__ == '__main__':
    main()

# "You may also be interested in: (webdriver)" — blog footer artifact, not code.