Task: scrape at least 50 items from the 热点精选 (Hot Picks) feed and save them to a CSV file.
Each row looks like:
index (starting from 1), title, link, ... (the first three columns are required; extra columns are optional)
import time
import pandas as pd
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
"""
使用selenium进行模拟登陆
1.初始化ChromDriver
2.打开腾讯新闻登陆页面
3.找到用户名的输入框,输入用户名
4.找到密码框,输入密码
5.提交用户信息
"""
name = '*******'
passwd = '*******'
driver = webdriver.Chrome(service=Service(r'D:\Python\Python37\Scripts\chromedriver.exe'))
driver.get('https://news.qq.com/')
# Maximize the window so every page element is visible
driver.maximize_window()
# Pause briefly while the page finishes loading
time.sleep(2)
# Click the login entry, then switch to the account/password login tab
button = driver.find_element(By.XPATH, '//*[@id="login"]/div/div/a/em')
button.click()
# The QQ login form lives inside an iframe; switch into it first
driver.switch_to.frame(driver.find_element(By.ID, 'ptlogin_iframe'))
button = driver.find_element(By.ID, 'switcher_plogin')
button.click()
# Fill in the account name and password
email = driver.find_element(By.NAME, 'u')
# email = driver.find_element(By.XPATH, '//input[@name="email"]')
email.send_keys(name)
password = driver.find_element(By.XPATH, '//*[@id="p"]')
# password = driver.find_element(By.XPATH, "//input[@name='password']")
password.send_keys(passwd)
submit = driver.find_element(By.CLASS_NAME, "login_button")
time.sleep(2)
submit.click()
time.sleep(5)
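The fixed sleeps above are fragile: on a slow connection the elements may not exist yet when the script looks for them. For reference, a minimal sketch of the first click rewritten with an explicit wait (the 10-second timeout is an assumption):

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block (up to an assumed 10 s) until the login entry is actually clickable
wait = WebDriverWait(driver, 10)
login_entry = wait.until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="login"]/div/div/a/em'))
)
login_entry.click()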
# Simulate scrolling down so the AJAX lazy loader keeps appending items
for i in range(1, 100):
    time.sleep(2)
    driver.execute_script("window.scrollTo(window.scrollX, %d);" % (i * 200))
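Scrolling a fixed 99 steps keeps waiting even after the feed stops growing. A minimal alternative sketch that scrolls until document.body.scrollHeight stops changing (the 2-second pause is an assumption about how fast the lazy loader responds):

# Scroll until the page height stops growing, i.e. no more AJAX content arrives
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the lazy loader time to fetch the next batch
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height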
# Parse the fully loaded page with lxml
html = driver.page_source
tree = etree.HTML(html)
results = []
# The Hot Picks entries sit under the second <ul>; grab title nodes and hrefs
contents = tree.xpath('//ul[2]//div[@class="detail"]/h3/a')
links = tree.xpath('//ul[2]//div[@class="detail"]/h3/a/@href')
# Keep the first 50 items (guard against fewer having been loaded)
for i in range(min(50, len(contents))):
    results.append([i + 1, contents[i].xpath('string(.)').strip(), links[i]])
for result in results:
    print(result)
# Save to CSV (utf-8-sig adds a BOM so Excel opens it correctly)
cols = ['index', 'title', 'url']
test = pd.DataFrame(columns=cols, data=results)
# print(test)
test.to_csv('answer.csv', index=False, encoding='utf-8-sig')
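As a quick sanity check, the file can be read back to confirm that at least 50 rows survived the round trip:

# Re-read the CSV and verify the row count
check = pd.read_csv('answer.csv', encoding='utf-8-sig')
print(len(check) >= 50, check.head(3))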
A second version, for comparison:

import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service(r"D:\Python\Python37\Scripts\chromedriver.exe"))
driver.get("https://news.qq.com")
# Understand AJAX loading: scroll step by step so the page keeps appending items
for i in range(1, 100):
    time.sleep(2)
    driver.execute_script("window.scrollTo(window.scrollX, %d);" % (i * 200))
from bs4 import BeautifulSoup

html = driver.page_source
bsObj = BeautifulSoup(html, "lxml")
# The Hot Picks list is the element immediately after the div with class "jx-tit"
jxtits = bsObj.find_all("div", {"class": "jx-tit"})[0].find_next_sibling().find_all("li")
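For reference, the same <li> list can also be reached with a CSS selector instead of find_next_sibling (a sketch assuming the class names above):

# "+" selects the element immediately following div.jx-tit
items = bsObj.select("div.jx-tit + * li")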
print("index",",","title",",","url")
for i,jxtit in enumerate(jxtits):
print(jxtit)
print('*'*80)
try:
text=jxtit.find_all("img")[0]["alt"]
except:
text=jxtit.find_all("div",{"class":"lazyload-placeholder"})[0].text
try:
url=jxtit.find_all("a")[0]["href"]
except:
print(jxtit)
print(i+1,",",text,",",url)
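This second version only prints its results; to meet the CSV requirement, the same loop could collect rows and write them with the standard csv module. A minimal sketch (answer2.csv is a hypothetical filename):

import csv

rows = []
for i, jxtit in enumerate(jxtits[:50]):
    try:
        text = jxtit.find_all("img")[0]["alt"]
    except (IndexError, KeyError):
        text = jxtit.find_all("div", {"class": "lazyload-placeholder"})[0].text
    try:
        url = jxtit.find_all("a")[0]["href"]
    except (IndexError, KeyError):
        url = ""
    rows.append([i + 1, text, url])

with open("answer2.csv", "w", newline="", encoding="utf-8-sig") as f:
    writer = csv.writer(f)
    writer.writerow(["index", "title", "url"])
    writer.writerows(rows)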
Conclusion:
I still think my own version is better; code you write yourself is easier to understand and digest. The main point of this exercise was to understand AJAX loading. I also tried replacing window.scrollX with 0 in the scrollTo call, but the program failed to run and the result was very unsatisfactory.