As a Ctrl+C/Ctrl+V programmer, the dream is to build airplanes; the reality is tightening screws.
An ordinary crawler is really nothing more than a fixed template:
import requests
import codecs,csv
import time
import random
headers = {
    # anti-hotlink referer check
    'referer': 'https://www.douyin.com/user/MS4wLjABAAAA27xpsfnWLkcy9tQJszZ5kXKtqrVyqtn6lrFDRmE1kJZ8tUom93dXxUK0pplVJnwQ?modal_id=7200628769755876648',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63',
    # some sites only return data when a cookie is sent
    'cookie': ''
}
# proxy IP pool; requests expects lowercase scheme keys and full proxy URLs
ip_list = [
    {'http': 'http://116.9.163.205:58080'},
    {'http': 'http://61.216.185.88:60808'},
    {'http': 'http://182.34.102.50:9999'},
    {'http': 'http://183.236.232.160:8080'},
    {'http': 'http://117.94.124.21:9000'},
    {'http': 'http://210.5.10.87:53281'},
    {'http': 'http://121.13.252.58:41564'},
    {'http': 'http://121.13.252.60:41564'},
    {'http': 'http://117.114.149.66:55443'},
    {'http': 'http://112.14.47.6:52024'}
]
# pick a proxy at random
# ip = ip_list.pop(random.randint(0, len(ip_list) - 1))
ip = random.choice(ip_list)
def getPlayUrl():
    for i in range(0, 1000):
        try:
            print('Scraping page ' + str(i))
            # request URL; usually you only need to vary a page parameter to get more pages
            url = ''
            res = requests.get(url, headers=headers, proxies=ip)
            # print(res.text)
            print(res)
            # the response is JSON, so extract the fields directly
            data = res.json()['comments']
            # print(data)
            for item in data:
                comments = {}
                comments['cid'] = item['cid']
                create_time = item['create_time']
                comments['time'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(create_time))
                # comments['user'] = item['user']['nickname']
                comments['comment'] = item['text']
                yield comments
            time.sleep(1)
        except Exception as e:
            print(e)
            break
if __name__ == '__main__':
    # save the data
    f = codecs.open('抖音评论_1.csv', 'a+', encoding='utf-8-sig')
    filename = ['cid', 'time', 'comment']
    writer = csv.DictWriter(f, filename)
    writer.writeheader()
    for i in getPlayUrl():
        print(i)
        writer.writerow(i)
    # getPlayUrl()
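One detail worth calling out in this template: requests only applies a proxy when a key in the proxies dict matches the request's URL scheme, which is lowercase ('http'/'https'), so an uppercase 'HTTP' key gets silently skipped and you scrape from your own IP. A minimal sketch to verify a proxy is actually being used; httpbin.org is just an echo service and the proxy address is a placeholder from the pool above:

import requests

proxies = {
    'http': 'http://116.9.163.205:58080',   # one entry from the pool above
    'https': 'http://116.9.163.205:58080',  # tunnel HTTPS through the same proxy
}
# httpbin echoes back the origin IP the target server saw
res = requests.get('http://httpbin.org/ip', proxies=proxies, timeout=5)
print(res.json())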
from bs4 import BeautifulSoup
import requests
import time
import codecs,csv
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
f=codecs.open('歌单_1.csv','w+', encoding='utf-8-sig')
filename=['歌单详情页地址','歌单标题','歌单播放量','歌单贡献者名字']
writer=csv.DictWriter(f,filename)
writer.writeheader()
j = 0
for i in range(0, 656, 35):
    time.sleep(2)
    print('Scraping page ' + str(j))
    url = 'https://music.163.com/discover/playlist/?&order=hot&limit=35&offset=' + str(i)
    response = requests.get(url=url, headers=headers)
    html = response.text
    soup = BeautifulSoup(html, 'html.parser')
    # tags containing the playlist detail-page links
    ids = soup.select('.dec a')
    # tags containing the playlist index-page info
    lis = soup.select('#m-pl-container li')
    print(len(lis))
    # use a separate loop variable so the page counter j is not overwritten
    for k in range(len(lis)):
        # playlist detail-page URL
        url = 'https://music.163.com' + ids[k]['href']
        # playlist title
        title = ids[k]['title']
        # playlist play count
        play = lis[k].select('.nb')[0].get_text()
        # playlist creator's name
        user = lis[k].select('p')[1].select('a')[0].get_text()
        # assemble the index-page info
        obj = {'歌单详情页地址': url, '歌单标题': title, '歌单播放量': play, '歌单贡献者名字': user}
        print(obj)
        # write the row to the CSV file
        writer.writerow(obj)
    j += 1
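The scraping itself is just two CSS selectors. If select() is unfamiliar, here is a tiny self-contained sketch of the same pattern; the markup is invented to mirror the structure the selectors above rely on, not NetEase's real page:

from bs4 import BeautifulSoup

html = '''
<ul id="m-pl-container">
  <li>
    <p class="dec"><a href="/playlist?id=1" title="Playlist One">Playlist One</a></p>
    <span class="nb">100万</span>
  </li>
</ul>
'''
soup = BeautifulSoup(html, 'html.parser')
ids = soup.select('.dec a')               # every <a> inside an element with class="dec"
lis = soup.select('#m-pl-container li')   # every <li> under id="m-pl-container"
print(ids[0]['href'], ids[0]['title'], lis[0].select('.nb')[0].get_text())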
import requests
from lxml import etree
import json
import re
import pprint
import codecs,csv
import time
import random
headers = {
    'referer': 'https://www.mafengwo.cn/mdd/citylist/10183.html',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.44',
    'cookie': ''
}
def getSpot():
    for i in range(0, 16):
        try:
            print('Scraping page ' + str(i) + '...')
            url = 'https://www.mafengwo.cn/mdd/base/list/pagedata_citylist'
            data = {
                'mddid': '10183',
                'page': i
            }
            res = requests.post(url=url, headers=headers, data=data, proxies=ip)
            # print(res)
            # print(res.text)
            data = json.loads(res.text)
            text = data['list']
            html = etree.HTML(text)
            # the trailing space in 'item ' is deliberate: XPath matches the class attribute literally
            total = html.xpath("//li[@class='item ']")
            for item in total:
                info = {}
                info['地址'] = item.xpath(".//div[@class='title']/text()")[0]
                info['累计游玩人数'] = item.xpath(".//div[@class='nums']/b/text()")[0]
                info['url'] = 'https://www.mafengwo.cn/' + item.xpath('.//a/@href')[0]
                yield info
            time.sleep(2)
        except Exception as e:
            print(e)
            break
if __name__ == "__main__":
    # proxy IP pool; requests expects lowercase scheme keys and full proxy URLs
    ip_list = [
        {'http': 'http://61.164.39.68:53281'},
        {'http': 'http://27.42.168.46:55481'},
        {'http': 'http://116.9.163.205:58080'},
        {'http': 'http://182.34.102.50:9999'},
        {'http': 'http://183.236.232.160:8080'},
        {'http': 'http://113.124.86.24:9999'},
        {'http': 'http://210.5.10.87:53281'}
    ]
    # pick a proxy at random (getSpot reads the global ip, so define it before iterating)
    ip = ip_list.pop(random.randint(0, len(ip_list) - 1))
    f = open('日本旅游目的地.csv', 'a+', newline='', encoding='utf-8-sig')
    filename = ['地址', '累计游玩人数', 'url']
    writer = csv.DictWriter(f, filename)
    # writer.writeheader()
    for comment in getSpot():
        print(comment)
        writer.writerow(comment)
Personally I find XPath fairly simple, and it is what I am most comfortable with.
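For reference, a minimal self-contained XPath sketch with lxml; the HTML is invented to mirror the fragments the code above queries:

from lxml import etree

text = '''
<ul>
  <li class="item ">
    <a href="travel-scenic-spot/mafengwo/10183.html">
      <div class="title">Tokyo</div>
      <div class="nums"><b>123456</b></div>
    </a>
  </li>
</ul>
'''
html = etree.HTML(text)
for item in html.xpath("//li[@class='item ']"):
    # the leading . keeps each sub-query relative to the current <li>
    print(item.xpath(".//div[@class='title']/text()")[0],
          item.xpath(".//div[@class='nums']/b/text()")[0],
          item.xpath(".//a/@href")[0])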
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import csv
import pandas as pd
import time
data = pd.read_csv('zhiwu.csv')
names=data['name']
urls=data['link']
def add_options():
    print("—————————— options ——————————")
    # build the Chrome driver options object
    chrome_options = webdriver.ChromeOptions()
    # skip loading images
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # chrome_options.add_experimental_option("prefs", prefs)
    # headless mode (no browser window)
    chrome_options.add_argument('--headless')
    # incognito (private browsing) mode
    chrome_options.add_argument('--incognito')
    # disable GPU acceleration
    chrome_options.add_argument('--disable-gpu')
    return chrome_options
# configure the Selenium ChromeDriver
# service = Service('path/to/chromedriver')
# driver = webdriver.Chrome()
driver = webdriver.Chrome(options=add_options())
# explicit-wait helper (10 s timeout)
wait = WebDriverWait(driver, 10)
# loop over every linked page and scrape its data
for name, url in zip(names, urls):
    # open the link and extract the fields
    driver.get(url)
    # pull out the assessment info, morphological features, geographic distribution, etc.
    try:
        eval_info = driver.find_element(By.XPATH, '//*[@id="swx"]').text
    except Exception:
        eval_info = ''
    try:
        morpho_feature = driver.find_element(By.XPATH, '//*[@id="tezheng"]').text
        # print(morpho_feature)
    except Exception:
        morpho_feature = ''
    try:
        geo_distribution = driver.find_element(By.XPATH, '//*[@id="chandi"]').text
        # print(geo_distribution)
    except Exception:
        geo_distribution = ''
    try:
        func_application = driver.find_element(By.XPATH, '//*[@id="gongneng"]').text
        # print(func_application)
    except Exception:
        func_application = ''
    try:
        protection_value = driver.find_element(By.XPATH, '//*[@id="protvalue"]').text
        # print(protection_value)
    except Exception:
        protection_value = ''
    try:
        protection_measure = driver.find_element(By.XPATH, '//*[@id="protway"]').text
        # print(protection_measure)
    except Exception:
        protection_measure = ''
    try:
        cultivation_points = driver.find_element(By.XPATH, '//*[@id="growway"]').text
        # print(cultivation_points)
    except Exception:
        cultivation_points = ''
    try:
        # the image link lives inside an iframe, so switch into it first
        iframe = driver.find_element(By.XPATH, '//*[@id="Label1"]/iframe')
        driver.switch_to.frame(iframe)
        img_url = driver.find_element(By.XPATH, '//*[@id="pinfo"]/a').get_attribute('href')
        print(img_url)
    except Exception:
        img_url = ''
    finally:
        # switch back out of the iframe before anything else touches the page
        driver.switch_to.default_content()
    # save one CSV per plant, named after the plant's Chinese name from the table
    with open(f'{name}.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['评估信息', '形态特征', '地理分布', '功能用途', '保护价值', '保护措施', '栽培要点', '图片链接'])
        writer.writerow([eval_info, morpho_feature, geo_distribution, func_application,
                         protection_value, protection_measure, cultivation_points, img_url])
    time.sleep(1)
# close the browser
driver.quit()
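Note that `wait` is created above but never actually used: every lookup runs immediately after `driver.get`, so a slow-rendering page just falls into the empty-string branch. An explicit wait is the usual fix. A small sketch for one of the fields, reusing the `wait`, `By`, and `EC` names already defined above:

# block for up to 10 s until the element exists, instead of failing immediately
try:
    eval_info = wait.until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="swx"]'))
    ).text
except Exception:
    eval_info = ''  # element never appeared within the timeout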
import requests
import codecs,csv
import time
import json
from lxml import etree
from pprint import pprint
headers = {
    # anti-hotlink referer check
    'referer': 'https://news.sina.com.cn/china/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.63',
}
def getInfo():
    for i in range(1, 31):
        try:
            print('Scraping page ' + str(i))
            # request URL; usually you only need to vary the page parameter to get more pages
            url = 'https://feed.sina.com.cn/api/roll/get?pageid=121&lid=1356&num=20&versionNumber=1.2.4&page=' + str(i) + '&encode=utf-8&callback=feedCardJsonpCallback'
            res = requests.get(url, headers=headers)
            # print(res.text)
            print(res)
            # the response is JSONP, so strip the callback wrapper before parsing the JSON
            text = res.text.split('try{feedCardJsonpCallback(')[1].split(');}catch(e){};')[0]
            # print(text)
            data = json.loads(text)
            # pprint(data)
            for item in data['result']['data']:
                info = {}
                info['标题'] = item['title']
                info['简介'] = item['intro']
                create_time = int(item['ctime'])
                info['发布时间'] = time.strftime("%Y-%m-%d %H:%M", time.localtime(create_time))
                info['主题'] = item['keywords']
                try:
                    info['评论数'] = item['comment_total']
                except Exception:
                    info['评论数'] = ''
                # fetch the detail page
                d_url = item['url']
                res_1 = requests.get(d_url, headers=headers)
                res_1.encoding = 'utf-8'
                html = etree.HTML(res_1.text)
                # string(.) flattens all text nodes under the article element
                info['详情'] = html.xpath('//*[@id="article"]')[0].xpath('string(.)').replace('\n', '').replace('\t', '')
                yield info
            time.sleep(2)
        except Exception as e:
            print(e)
            break
if __name__ == '__main__':
    # save the data
    f = codecs.open('新浪新闻.csv', 'w+', encoding='utf-8-sig')
    filename = ['标题', '简介', '发布时间', '主题', '评论数', '详情']
    writer = csv.DictWriter(f, filename)
    writer.writeheader()
    for info in getInfo():
        print(info)
        writer.writerow(info)
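The hard-coded split on 'try{feedCardJsonpCallback(' breaks the moment Sina tweaks the wrapper. A slightly more tolerant sketch, assuming the same wrapper shape the code above splits on, isolates the payload without depending on the exact 'try{' prefix:

import json

def jsonp_to_json(text, callback='feedCardJsonpCallback'):
    # keep everything after 'callback(' ...
    payload = text.partition(callback + '(')[2]
    if not payload:
        raise ValueError('callback not found in response')
    # ... and drop the wrapper tail after the last ');'
    payload = payload.rpartition(');')[0]
    return json.loads(payload)

# usage inside getInfo(): data = jsonp_to_json(res.text)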