import csv
import os.path
import random
import re
import traceback
from time import sleep
import grequests
import requests
from faker import Factory
from lxml import etree
class Iask:
def __init__(self):
print('程序开始运行')
# self.url = 'http://kan.iask.sina.com.cn/index/ajax.html?pageNum=1&category=&areaCode=all'
# 此为第一页
        self.url_list = [f'http://kan.iask.sina.com.cn/index/ajax.html?pageNum={i}&category=&areaCode=all' for i in range(1, 9)]
if not os.path.exists('./news'):
os.mkdir('./news')
print('创建文件夹成功')
def get_headers(self):
headers = {
'Host': 'kan.iask.sina.com.cn',
'Referer': 'http://kan.iask.sina.com.cn/area/all.html',
# 'User-Agent':Factory.create('zh-CN').user_agent()
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
return headers
def get_all_url(self):
with open('./all_detail_url.csv','w',encoding='utf-8',newline='') as file:
writer = csv.writer(file)
for url in self.url_list:
html = requests.get(url=url,headers=self.get_headers(),timeout=20)
if html.status_code != 200:
while True:
print('访问失败,正在重新访问')
headers = {
'user-agent': Factory.create('zh-CN').user_agent(),
'Host': 'kan.iask.sina.com.cn',
'Referer': 'http://kan.iask.sina.com.cn/area/all.html',
}
html = requests.get(url=url, headers=headers, timeout=20)
if html.status_code == 200:
break
sleep(random.randint(1,3))
# 访问失败异常处理
tree = etree.HTML(html.text)
li_list = tree.xpath('//ul[@class="foreigner-list"]/li')
for li in li_list:
try:
title = li.xpath('.//div[@class="text-con"]/h4/a/text()')[-1].strip()
title = ''.join(re.findall(r'[^*"/:?\\|<>]',title,re.S)).replace('.','')
if not os.path.exists(f'./news/{title}'):
os.mkdir(f'./news/{title}')
with open(f'./news/{title}/pic.jpg','wb') as pic:
img_url = li.xpath('./div[@class="pic-con"]//img/@src')[0]
img_html = requests.get(url=img_url,timeout=20)
pic.write(img_html.content)
detail_url = 'http://kan.iask.sina.com.cn' + li.xpath('./div[@class="text-con"]/h4/a/@href')[0]
data = [img_url,detail_url]
writer.writerow(data)
except Exception as e:
print(e,traceback.format_exc())
print('一页url已经提取完毕')
sleep(random.randint(1, 3))
# 获得一页新闻的每个的url,储存图片,创建以标题命名的文件夹
def get_all_info(self):
with open('./all_detail_url.csv','r',encoding='utf-8') as file:
reader = csv.reader(file)
url_list = [url[-1] for url in reader][:]
req_list = [grequests.get(url=url,headers=self.get_headers(),timeout=20) for url in url_list]
html_list = grequests.imap(requests=req_list,size=20,exception_handler=self.handler_exception)
for html in html_list:
try:
tree = etree.HTML(html.text)
title = tree.xpath('//h1[@class="detail-title"]/text()')[0].strip()
title = ''.join(re.findall(r'[^*"/:?\\|<>]', title, re.S)).replace('.','')
content = tree.xpath('//p[@class="abstract-con"]//text()')[0]
comment_img = tree.xpath('//div[@class="article-con"]/img/@src')
num = len(comment_img)
for i in range(num):
img_html = requests.get(url=comment_img[i],timeout=20)
with open(f'./news/{title}/pic_{i}.jpg','wb') as pic:
pic.write(img_html.content)
comment_list = tree.xpath('string(//div[@class="article-con"])')[:]
with open(f'./news/{title}/{title}.txt','w',encoding='utf-8') as f:
f.write(content)
f.write('\n')
f.write(comment_list)
sleep(random.randint(1, 3))
except Exception as e:
print(e, traceback.format_exc())
print('一条数据爬取完毕')
def handler_exception(self,Requests,Exception):
print(f'{Requests.url}访问失败')
if __name__ == '__main__':
i = Iask()
i.get_all_url()
i.get_all_info()
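A note on the retry logic in get_all_url(): the while True loop retries forever and will still raise if the connection itself fails. A bounded-retry helper along these lines avoids both problems (a sketch, not part of the original script; the function name is mine):
import random
from time import sleep
import requests

def fetch_with_retry(url, headers, max_retries=5, timeout=20):
    # Try up to max_retries times; return the 200 response, or None if every attempt fails.
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url=url, headers=headers, timeout=timeout)
            if resp.status_code == 200:
                return resp
            print(f'attempt {attempt} returned status {resp.status_code}, retrying')
        except requests.RequestException as e:
            print(f'attempt {attempt} failed: {e}')
        sleep(random.randint(1, 3))
    return None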
2 aizhan site rankings
https://top.aizhan.com
import csv
import random
import traceback
from time import sleep
import grequests
import requests
from faker import Factory
from lxml import etree
class Aiwangzhan:
def __init__(self):
print('程序开始运行')
self.url_list = [
f'https://top.aizhan.com/area/p{i}.html' for i in range(1,101) # 国内网站
]
self.url_list_abroad = [f'https://top.aizhan.com/abroadarea/p{i}.html' for i in range(1, 101)] # 国外网站
def get_headers(self):
headers = {
'Host': 'top.aizhan.com',
# 'Referer': 'https://top.aizhan.com/topic/',
'User-Agent':Factory.create('zh-CN').user_agent()
# 'user-agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
return headers
def get_all_area_url(self):
# for url in self.url_list:
req_list = [grequests.get(url=url,headers=self.get_headers(),timeout=20) for url in self.url_list]
html_list = grequests.imap(requests=req_list,size=20,exception_handler=self.handler_exception)
with open(f'./data_area.csv', 'w', encoding='utf-8', newline='') as file:
writer = csv.writer(file)
for html in html_list:
try:
tree = etree.HTML(html.text)
li_list = tree.xpath('//div[@class="list"]/ul/li')
for li in li_list:
rank = li.xpath('.//div[@class="alexa"]/span[1]/text()')[0].split(':')[-1]
title = li.xpath('.//h2/a/text()')[0]
href = f"https://{li.xpath('.//h2/em/text()')[0]}"
introduce = li.xpath('.//p/a/text()')[0]
data = [rank,title,href,introduce]
writer.writerow(data)
except Exception as e:
print(e,traceback.format_exc())
print(f'{html.url}访问失败')
sleep(random.randint(0,2))
print('国内网站已经爬取完毕')
    def get_all_abroad_url(self):
req_list = [grequests.get(url=url, headers=self.get_headers(), timeout=20) for url in self.url_list_abroad]
html_list = grequests.imap(requests=req_list, size=20, exception_handler=self.handler_exception)
with open(f'./data_abroad.csv', 'w', encoding='utf-8', newline='') as file:
writer = csv.writer(file)
for html in html_list:
try:
tree = etree.HTML(html.text)
li_list = tree.xpath('//div[@class="list"]/ul/li')
for li in li_list:
rank = li.xpath('.//div[@class="alexa"]/span[1]/text()')[0].split(':')[-1]
title = li.xpath('.//h2/a/text()')[0]
href = f"https://{li.xpath('.//h2/em/text()')[0]}"
introduce = li.xpath('.//p/a/text()')[0]
data = [rank, title, href, introduce]
writer.writerow(data)
except Exception as e:
print(e, traceback.format_exc())
print(f'{html.url}访问失败')
sleep(random.randint(0, 2))
print('国外网站已经爬取完毕')
def handler_exception(self, Requests, Exception):
url = Requests.url
html = requests.get(url=url,headers=self.get_headers(),timeout=20)
if html.status_code == 200:
return html
else:
print(f'{Requests.url}访问失败')
if __name__ == '__main__':
a = Aiwangzhan()
a.get_all_area_url()
    a.get_all_abroad_url()
3 Sending email (scheduled book list from sobooks.cc)
# -*- coding: utf-8 -*-
import requests, bs4
import smtplib
import schedule
import time
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
from email.header import Header
# account = '{0}'.format('发件人qq邮箱')
# password = '{0}'.format('qq邮箱授权码')
# receiver = '{0}'.format('收件人163邮箱或者qq邮箱')
account = '{0}'.format('[email protected]')
password = '{0}'.format('qwertyuiop')
receiver = '{0}'.format('[email protected]')
# 更改自己的账户和授权码
# 爬虫任务,获取sobooks网站上的书名和作者,其中页面的话,可以根据自己需求进行修改
def recipe_spider():
list_all = ''
num = 0
for a in range(1, 3):
n = '{0}{1}'.format('https://sobooks.cc/page/', a)
res = requests.get(n)
res.encoding = res.apparent_encoding
bs = BeautifulSoup(res.text, 'html.parser')
# print(bs)
books = bs.find_all('h3')
authors = bs.find_all('p')
for i in range(len(books)):
num = num + 1
book = books[i].text.strip()
author = authors[i + 1].text.strip()
# list_books.append([book,author])
# list_books.append(list_book)
n = '''
书名%s: %s,作者: %s
''' % (num, book, author)
list_all = list_all + n
return list_all
# 将获取到的内容发送邮件
def send_email(list_all):
global account, password, receiver
mailhost = 'smtp.qq.com'
qqmail = smtplib.SMTP()
qqmail.connect(mailhost, 25)
qqmail.login(account, password)
content = '亲爱的,今天书单' + list_all
print(content)
message = MIMEText(content, 'plain', 'utf-8')
subject = '今天看什么'
message['Subject'] = Header(subject, 'utf-8')
try:
qqmail.sendmail(account, receiver, message.as_string())
print('邮件发送成功')
except:
print('邮件发送失败')
qqmail.quit()
def job():
print('开始一次任务')
list_all = recipe_spider()
send_email(list_all)
print('任务完成')
if __name__ == '__main__':
# 定时任务,其中0.05是间隔的意思,以分钟为间隔,时间默认是整数。
schedule.every(0.05).minutes.do(job)
while True:
schedule.run_pending()
time.sleep(1)
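The script above talks to smtp.qq.com in plain text on port 25, which QQ Mail often rejects; if sending fails, an SSL connection on port 465 is worth trying. A minimal sketch reusing the same account/password/receiver values (the function name is mine):
import smtplib
from email.mime.text import MIMEText
from email.header import Header

def send_email_ssl(account, password, receiver, list_all):
    # Same message construction as send_email(), but over an encrypted SMTPS connection.
    message = MIMEText('亲爱的,今天书单' + list_all, 'plain', 'utf-8')
    message['Subject'] = Header('今天看什么', 'utf-8')
    with smtplib.SMTP_SSL('smtp.qq.com', 465) as qqmail:
        qqmail.login(account, password)
        qqmail.sendmail(account, receiver, message.as_string())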
4 Pexels image download
import csv
import random
import time
import grequests
import requests
from faker import Factory
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
def get_url_file():
opt = Options()
# opt.add_argument('--headless')
opt.add_experimental_option('useAutomationExtension', False)
opt.add_experimental_option("excludeSwitches", ['enable-automation'])
opt.add_argument('--disable-blink-features=AutomationControlled')
    opt.add_argument(f'--user-agent={Factory.create().user_agent()}')
driver = Chrome(options=opt)
url = 'https://www.pexels.com/zh-cn/search/%E6%8A%BD%E8%B1%A1/'
driver.get(url=url)
print('已请求链接,正在下拉')
driver.maximize_window()
time.sleep(2)
a = 1
for y in range(300):
js = f'window.scrollBy(0,{random.randint(2500,3500)})'
driver.execute_script(js)
time.sleep(random.randint(1,3))
print(f'正在下拉第{a}下')
a += 1
html = driver.page_source
# print(html)
tree = etree.HTML(html)
with open('url_file.csv','a',encoding='utf-8',newline='') as file:
url_file_csv = csv.writer(file)
div_list = tree.xpath('//div[@class="photos__column"]/div')
i = 1
for div in div_list:
try:
img_url = div.xpath('.//a[@class="js-photo-link photo-item__link"]/img/@srcset')[0].split(',')[0]
print(img_url)
url_file_csv.writerow([img_url])
print(f'第{i}条url写入文件')
i += 1
# print(html)
except Exception as e:
print(e)
driver.quit()
def save_png():
# with open('./url_file.csv','r',encoding='utf-8') as file:
# 访问所有url
with open('./exception.csv','r',encoding='utf-8') as file:
# 处理异常url
url_list = list(csv.reader(file))
headers = {
'user-agent':Factory.create().user_agent(),
# 'Accept-Encoding': 'gzip, deflate, br',
}
req_list = []
for url in url_list[:]:
req = grequests.get(url=url[0],headers=headers,timeout=30)
req_list.append(req)
html_list = grequests.imap(req_list,size=500,exception_handler=handler_exception)
i = 5860
for html in html_list:
try:
type = html.url.split('.')[-1].split('?')[0]
img_name = f'pexels_抽象_{i}.{type}'
with open(f'./img_1/{img_name}','wb') as f:
f.write(html.content)
print(f'第{i}个图片储存完毕')
except :
print(f'{html.url}访问异常')
i += 1
def handler_exception(Request,Exception):
url = Request.url
try:
html = requests.get(url=url,timeout=20)
if html.status_code == 200:
return html
except:
print(f'{url}访问失败')
if __name__ == '__main__':
# get_url_file()
# 获得所有包含url的url文件
save_png()
# 保存图片
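Because get_url_file() opens url_file.csv in append mode, running it more than once can leave duplicate links. A small helper can dedupe before downloading (a sketch, assuming the one-URL-per-row layout written above; the function name is mine):
import csv

def load_unique_urls(path='./url_file.csv'):
    # Keep the first occurrence of each URL, preserving order.
    seen = set()
    urls = []
    with open(path, 'r', encoding='utf-8') as file:
        for row in csv.reader(file):
            if row and row[0] not in seen:
                seen.add(row[0])
                urls.append(row[0])
    return urls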
5 Today in history
import requests
from faker import Factory
from lxml import etree
url = 'https://tool.lu/todayonhistory/'
headers = {
'user-agent': Factory.create().user_agent()
}
html = requests.get(url=url,headers=headers)
# print(html.text)
tree = etree.HTML(html.text)
li_list = tree.xpath('//ul[@id="tohlis"]/li')
emjo_list = ['♠','♥','♣','♦']
i = 1
for li in li_list:
emjo = emjo_list[i%4]
title = li.xpath('./text()')[0]
href = li.xpath('./a/@href')[0]
print(emjo,title,href)
i += 1
6 Holiday countdown
#!/usr/bin/env python
# coding=utf-8
# import datetime
# spring=datetime.datetime(2022,1,31,0,0,0) #春节日期
# today=datetime.datetime.now() #今天是几月几号
# day=(spring-today).days #得到还有几天
# second=(spring-today).seconds #得到还有几秒
# sec=second%60 #根据秒数得到还有几秒
# minute=second/60%60 #根据秒得到分钟数
# hour=second/60/60 #根据秒数得到小时
# if hour>24:
# hour=hour-24 #如果超过24小时,就要算超过1天,所以要减去24
# print ("离今年春节还相差几天")
# print ("还有 %d 天 %d 小时 %d 分钟 %d 秒" %(day,hour,minute,sec))
import datetime
import requests
from loguru import logger
# 节日锚点
holiday_list = [
{"平安夜": "2021-12-24"},
{"圣诞节": "2021-12-25"},
{"元旦假期": "2022-01-01"},
{"春节假期": "2022-01-31"},
]
def get_holiday():
global holiday_list
"""
获取配置中的节日设置
:return: list——>[{'节日名':'节日日期'}]
"""
holiday_content = ''
# 今天日期
now_str = datetime.datetime.now().strftime('%Y-%m-%d')
now = datetime.datetime.strptime(now_str, "%Y-%m-%d")
for holiday_info in holiday_list:
holiday_name = list(holiday_info.keys())[0]
holiday_date = holiday_info[holiday_name]
future = datetime.datetime.strptime(holiday_date, "%Y-%m-%d")
days = (future - now).days
holiday_content = holiday_content + '距离' + holiday_name + '还有' + str(days) + '天' + '\n'
return holiday_content
def get_tg():
"""
获取日记
:return: bool or str
"""
url = f"https://fabiaoqing.com/jichou/randomrj.html"
try:
res = requests.post(url=url).json()
return res['tgrj'] + '\n'
except:
return False
def get_weather():
"""
获取天气预报
:return: str or false
"""
url = f"http://apis.juhe.cn/simpleWeather/query"
params = {
'city': '深圳',
'key': '7612ddda2313a41481327cbef5261b46',
}
try:
res = requests.get(url=url, params=params).json()
now_str = datetime.datetime.now().strftime('%Y-%m-%d')
weather_content = f"""【摸鱼办公室】\n今天是 {now_str} 星期 {datetime.datetime.now().weekday() + 1}\n{res['result']['city']} 当前天气 {res['result']['realtime']['info']} {res['result']['realtime']['temperature']}摄氏度\n早上好,摸鱼人!上班点快到了,收拾收拾,该吃饭吃饭,该溜达溜达,该上厕所上厕所。别闲着\n"""
return weather_content
except:
return False
if __name__ == '__main__':
holiday_content = get_holiday()
if not holiday_content:
logger.error(f"节日为空。")
holiday_content = ''
else:
logger.info(f"获取到节日:\n{holiday_content}")
tg_content = get_tg()
if not tg_content:
logger.error(f"日记为空。")
tg_content = ''
else:
logger.info(f"获取到日记:\n{tg_content}")
weather_content = get_weather()
if not weather_content:
logger.error(f"天气为空。")
weather_content = ''
else:
logger.info(f"获取到天气:\n{weather_content}")
complete_content = weather_content + holiday_content + tg_content + '工作再累 一定不要忘记摸鱼哦!有事没事起身去茶水间去厕所去廊道走走,别老在工位上坐着钱是老板的,但命是自己的'
logger.info(f"整合内容开始推送:\n{complete_content}")
7 English novels (wuxiaworld.com)
import csv
import os
import random
import re
import time
from time import sleep
import grequests
import requests
from faker import Factory
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
def get_novel_url():
url = 'https://www.wuxiaworld.com/novels'
opt = Options()
# opt.add_argument('--headless')
driver = Chrome(options=opt)
try:
driver.get(url)
driver.maximize_window()
sleep(1)
for i in range(7):
js = 'window.scrollBy(0,2000)'
driver.execute_script(js)
sleep(1)
print('已经拉到了底部')
sleep(2)
html = driver.page_source
with open('novel.csv','w',encoding='utf-8',newline='') as file:
writer = csv.writer(file)
tree = etree.HTML(html)
div_list = tree.xpath('//div[@id="search-results"]/div')
for div in div_list:
img = div.xpath('.//div[@class="css-1d6p0h5-NovelItemImageContainer e1amv7yz10"]/a/img/@data-src')[0]
title = div.xpath('.//div[@class="novel-title css-1f885ls-NovelTitle e1amv7yz21"]/a/text()')[0]
url = 'https://www.wuxiaworld.com' + div.xpath('.//div[@class="novel-title css-1f885ls-NovelTitle e1amv7yz21"]/a/@href')[0]
data = [img,title,url]
print(data)
writer.writerow(data)
# print(html)
# sleep(2)
except Exception as e:
print(e)
finally:
driver.quit()
# 只能爬出21个url,直接在浏览器上复制的
def get_headers():
headers = {
'referer': 'https://www.wuxiaworld.com/novels',
'user-agent': Factory.create('zh-CN').user_agent(),
}
return headers
def get_info():
with open('./novel.csv','r',encoding='utf-8') as file:
url_list = list(csv.reader(file))
url_list = [url[0] for url in url_list[:]]
print(url_list)
req_list = [grequests.get(url=url,headers=get_headers(),timeout=20) for url in url_list]
html_list = grequests.imap(req_list,size=10,exception_handler=handler_exception)
# 10个并发
for html in html_list:
start_time = time.perf_counter()
try:
tree = etree.HTML(html.text)
name = tree.xpath('//div[@class="novel-body"]/h2/text()')[0]
author = tree.xpath('//dt[contains(text(),"Author")]/following-sibling::*/text()')[0]
prew = '\n'.join(tree.xpath('//div[@class="fr-view"]//text()')).replace(' ','')
if not os.path.exists(f'./novel/{name}'):
os.mkdir(f'./novel/{name}')
# 为每个创建单独的文件夹
with open(f'./novel/{name}/Synopsis.txt','w',encoding='utf-8') as synopsis:
synopsis.write(f'{name}\nAuthor:{author}\nIntroduction\n{prew}')
#保存简介,文件为txt
img_url = tree.xpath('//img[@class="media-object img-thumbnail"]/@src')[0]
with open(f'./novel/{name}/main_pic.jpg','wb') as pic:
req = requests.get(url=img_url)
pic.write(req.content)
# 保存主图
href_list = tree.xpath('//div[@class="panel-group"]/div/div[@role="tabpanel"]//div[@class="row"]/div//li/a/@href')
            href_list = [f'https://www.wuxiaworld.com{href}' for href in href_list]
# 得到所有章节的url
req_list_cha = [grequests.get(url=href,headers=get_headers(),timeout=20) for href in href_list]
html_list_cha = grequests.imap(requests=req_list_cha,size=500,exception_handler=handler_exception)
# 500个并发
for html_cha in html_list_cha:
tree_cha = etree.HTML(html_cha.text)
try:
title = ''.join(tree_cha.xpath('//div[@class="caption clearfix"]//h4/text()')).replace(':','--').strip().replace('\n','')
title = re.findall(r'[^\*"/:?\\|<>]', title, re.S)
title = ''.join(title)
content = ''
p_list = tree_cha.xpath('//div[@id="chapter-content"]//p')
for p in p_list:
content += '\t' + ''.join(p.xpath('.//text()'))
content += '\n'*2
content = content.replace(' ','')
# content = '\n'.join(tree_cha.xpath('//div[@id="chapter-content"]//text()'))
with open(f'./novel/{name}/{title}.txt','w',encoding='utf-8') as t:
t.write(content)
except Exception as e:
print(f'访问章节出错,错误的原因{e}')
# 为每个章节创建一个txt文件
end_time = time.perf_counter()
print(f'{name}所有章节已经下载完毕')
print(f'下载 {name} 所用时间为{end_time-start_time}')
        except Exception as e:
            print(e)
finally:
print(f'{name}\t爬取完毕')
sleep(random.randint(1,3))
def handler_exception(Request,Exception):
url = Request.url
headers = {
'referer': 'https://www.wuxiaworld.com/novels',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
}
html = requests.get(url=url,headers=headers)
if html.status_code == 200 :
return html
else:
print(f'{Request.url}访问失败')
if __name__ == '__main__':
get_info()
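The same filename-sanitizing pattern (re.findall over the characters Windows forbids, then join) appears here and in the earlier crawlers; pulling it into one helper makes the intent clearer. A sketch using re.sub instead (the function name is mine):
import re

def sanitize_filename(name):
    # Remove characters that are not allowed in Windows file or folder names.
    return re.sub(r'[\*"/:?\\|<>]', '', name).strip()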
8 SMS verification-code requests
# SMS bomber: this feels like constantly testing the edge of the law
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
# 设置代理
proxy = '220.191.64.149'
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--proxy-server = http://' + proxy)
broswer = webdriver.Chrome(options=chrome_options)
# 测试代理
# broswer.get('http://httpbin.org/get')
# 输入即将收到短信的手机号码
tel = 123154564
# 定位,并输入手机号码
def tel_num_try(input_tel):
print("【目前状态】:\n定位号码输入框,并输入手机号码...")
try:
print("使用id定位中...")
bot1 = broswer.find_element_by_id(input_tel)
bot1.send_keys(tel)
except:
print("【id定位失败!】:\n使用class_name定位中...")
bots2 = broswer.find_element(By.CLASS_NAME, input_tel)
time.sleep(1)
bots2.send_keys(tel)
# 定位并点击(方案1)
def tel_power_try_child1(btn):
print("【当前状态】:定位元素,点击鼠标..")
try:
print("使用id定位中...")
bot3 = broswer.find_element_by_id(btn)
# bot1 = broswer.find_element_by_css_selector(btn)
bot3.click()
time.sleep(1)
except Exception as e:
print("【id定位失败!】\n 使用by_xpath定位中...")
bots4 = broswer.find_element_by_xpath(btn)
bots4.click()
time.sleep(1)
# 定位并点击(方案2)
def tel_power_try_child2(btn):
print("【当前状态】:定位元素,点击鼠标..")
try:
print("使用id定位中...")
bot3 = broswer.find_element_by_id(btn)
# bot1 = broswer.find_element_by_css_selector(btn)
bot3.click()
time.sleep(1)
except:
print("【id定位失败!】\n使用CSS_SELECTOR定位中...")
bots4 = broswer.find_element(By.CSS_SELECTOR, btn)
bots4.click()
time.sleep(1)
# 调用两个点击方案
def tel_power_try(btn):
try:
tel_power_try_child1(btn)
except:
tel_power_try_child2(btn)
# 模板:两步获取验证码:手机号定位,验证码定位;
def tel_get_2(url, input_tel, btn):
# 获取地址
broswer.get(url)
# 定位手机号输入
tel_num_try(input_tel)
# 定位"获取验证码"按钮
tel_power_try(btn)
# 模板:三步获取验证码:into:登录定位,input_tel:手机号定位,btn:验证码定位;
def tel_get_3(url, into, input_tel, btn):
# 获取地址
broswer.get(url)
tel_power_try(into)
# 定位手机号输入
tel_num_try(input_tel)
# 定位"获取验证码"按钮
tel_power_try(btn)
# 模板:三步获取验证码:登录定位,手机号定位,验证码定位;
def tel_get_k3(url, into, input_tel, btn):
# 获取地址
broswer.get(url)
# 定位手机号输入
tel_num_try(input_tel)
tel_power_try(into)
# 定位"获取验证码"按钮
tel_power_try(btn)
# 模板:三步获取验证码:手机定位,滑动验证码检验(变速滑动)、验证码定位;(待续)
# 模板:三步获取验证码:手机定位,图像文字识别检验(涉及太广)、验证码定位;(待续)
class Spider_tel():
def __init__(self):
pass
def TongCheng_com(self):
url = "https://passport.58.com/reg/?path=https%3A//gz.58.com/&PGTID=0d100000-0000-33f9-63ec-9ca2641f5e25&ClickID=3"
input_tel = 'phone'
btn = '.getcode'
print('【TongCheng_com】')
tel_get_2(url, input_tel, btn)
def Guazi_com(self):
url = "https://www.guazi.com/qinhuangdao/dazhong/"
into = 'js-login-new'
input_tel = "phone-login-input"
btn = '.get-code'
print('【Guazi_com】')
tel_get_3(url, into, input_tel, btn)
def JianShu(self):
url = "https://www.jianshu.com/sign_up"
input_tel = 'user_mobile_number'
btn = 'send_code'
print('【JianShu】')
tel_get_2(url, input_tel, btn)
def SuNingYiGou(self):
url = "https://reg.suning.com/person.do?myTargetUrl=https%3A%2F%2Fwww.suning.com%2F%3Fsafp%3Dd488778a.uzD.0.acf325284e"
into = '.agree-btn'
input_tel = "mobileAlias"
btn = 'sendSmsCode'
print('【SuNingYiGou】')
tel_get_3(url, into, input_tel, btn)
def FanKe(self):
url = "https://www.fkw.com/reg.html"
input_tel = 'acct'
btn = '.button'
print('【FanKe】')
tel_get_2(url, input_tel, btn)
def WangyiYun(self):
# 难啃系数3颗星
url = "https://id.163yun.com/register?referrer=https://dun.163.com/dashboard&h=yd&"
into = ".yidun_intelli-text"
input_tel = "m-input"
btn = '.m-btn'
print('【WangyiYun】')
tel_get_k3(url, into, input_tel, btn)
def BeiRui(self):
url = 'https://console.oray.com/passport/register.html?fromurl=http%3A%2F%2Fdomain .oray.com%2F'
into = '//*[@id="tips-protocol-win"]/div/div/div/div[2]/p/ input[1]'
input_tel = "mobile"
btn = "re-get"
print('【BeiRui】')
tel_get_3(url, into, input_tel, btn)
def XueJia(self):
url = "https://cn.student.com/au/adelaide?utm_source=baidu&utm_medium=cpc&utm_campaign=3_destination_au_pc&utm_content=3_adelaide_g_web_p&utm_term=adelaide%E7%A7%9F%E6%88%BF%E7%BD%91#sign-up"
input_tel = 'input-field__input'
btn = '.send-button__text'
print('【XueJia】')
tel_get_2(url, input_tel, btn)
def run(self):
# pass
self.TongCheng_com()
# self.Guazi_com()
# self.JianShu()
# self.FanKe() #需要滑块验证(留待解决)
# self.SuNingYiGou()
# self.WangyiYun()
# self.BeiRui()
# self.XueJia()
s = Spider_tel()
s.run()
9 Qunar travel notes
import csv
import os
import random
import re
from time import sleep
import grequests
import redis
import requests
from faker import Factory
from lxml import etree
class Qunaer:
def __init__(self):
self.redis_conn = redis.Redis(host='172.22.30.106',port=6379,db=4,encoding='utf8')
self.start_num = int(input('请输入你要爬取的开始页(150以内的数字): \n'))
self.end_num = self.start_num + int(input('请输入你要爬取的页数: \n'))
if not os.path.exists('./data'):
os.mkdir('./data')
def get_headers(self,id):
headers = {
'user-agent':Factory.create().user_agent(),
# 'referer': 'https://travel.qunar.com/place/',
# 'referer': f'https://travel.qunar.com/travelbook/list/22-shenzhen-300118/hot_heat/{random.randint(1,100)}.htm',
'referer': f'https://user.qunar.com/passport/login.jsp?ret=https%3A%2F%2Ftravel.qunar.com%2Ftravelbook%2Fnote%2F{id}',
'cookie':'JSESSIONID=3BD99CE2B103F1175FA1C7217E9A7E0C; QN1=00007500306c363b50d8bde7; QN300=auto_4e0d874a; QN99=4785; QN205=auto_4e0d874a; QN277=auto_4e0d874a; _i=VInJOmi4r81qEol1m6DaxOvXgGcq; QunarGlobal=10.86.213.148_1147fecb_17b29f57065_168d|1628496924368; QN269=F30A77B0F8E911EBBC9DFA163E2DD765; QN601=faf3d53d71e28e30417a5d8a079b6790; QN48=00008a802f10363b5100236b; fid=8f2de4d6-b651-43e9-9717-fc0ac5ec5916; HN1=v15c2748a262d3008af33574ac54311d63; HN2=qusnngrcknuql; viewdist=300118-1; uld=1-300118-1-1628672592; qunar-assist={%22show%22:false%2C%22audio%22:false%2C%22speed%22:%22middle%22%2C%22zomm%22:1%2C%22cursor%22:false%2C%22pointer%22:false%2C%22bigtext%22:false%2C%22overead%22:false}; qunar-assist-ignore=[%22https://www.qunar.com/%22]; QN163=0; QN6=auto_4e0d874a; QN25=6602c1e0-ae80-4659-9619-e709a3d77216-9f992f90; QN42=mjiq0111; _q=U.fpnhmxe2812; _t=27387490; csrfToken=AzcqVpY4bmfLMKkfAbsmWnsdMT2fGmsg; _s=s_5452FRSKU2J6MQYLQ37QBEVPBY; _v=PC1b8KbOc-0LeyQZG6e3TZLMo8bVpObGJcAYeGuaJwJGw9GJo65ZmidayQKVThjZ0obxA_QT2HXDhmwslkfgrH_LFxth3Wj7ii1oHl5FKEaY3uhpstLGWgH03UDtcXwH3l6I6XrDB78BXJRA8Zp_XLXvN3S-4JnRwS9SS634gDSG; _vi=aEpXhPrURgDs599zZE5R59dj3CxeNdLAX3dIO9nwJRRvJycK9x4xHYBAxqTtQjomkSugVIswwqIZ_eNzwLnbR3FIqFFd_tuhvRi9FM3WAzKeQXBc792NETkD2cxGdm7o3gZIjpYOSfe7docvcAZabnoSHAZOZBfF3JSEc0hBXRAa; viewbook=7042357|7397301|7397301|6313894|7397301; QN44=fpnhmxe2812; QN267=8958728936dd37643; Hm_lvt_c56a2b5278263aa647778d304009eafc=1628672633,1628673661,1628734107,1628734222; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1628734222; QN271=4366dc9e-ac3c-496d-b30b-ac3e1c66266a'
}
return headers
def get_main_headers(self):
headers = {
'user-agent': Factory.create().user_agent(),
# 'referer': 'https://travel.qunar.com/place/',
'referer': f'https://travel.qunar.com/travelbook/list/22-shenzhen-300118/hot_heat/{random.randint(1,100)}.htm',
# 'referer': f'https://user.qunar.com/passport/login.jsp?ret=https%3A%2F%2Ftravel.qunar.com%2Ftravelbook%2Fnote%2F{id}',
'cookie': 'JSESSIONID=3BD99CE2B103F1175FA1C7217E9A7E0C; QN1=00007500306c363b50d8bde7; QN300=auto_4e0d874a; QN99=4785; QN205=auto_4e0d874a; QN277=auto_4e0d874a; _i=VInJOmi4r81qEol1m6DaxOvXgGcq; QunarGlobal=10.86.213.148_1147fecb_17b29f57065_168d|1628496924368; QN269=F30A77B0F8E911EBBC9DFA163E2DD765; QN601=faf3d53d71e28e30417a5d8a079b6790; QN48=00008a802f10363b5100236b; fid=8f2de4d6-b651-43e9-9717-fc0ac5ec5916; HN1=v15c2748a262d3008af33574ac54311d63; HN2=qusnngrcknuql; viewdist=300118-1; uld=1-300118-1-1628672592; qunar-assist={%22show%22:false%2C%22audio%22:false%2C%22speed%22:%22middle%22%2C%22zomm%22:1%2C%22cursor%22:false%2C%22pointer%22:false%2C%22bigtext%22:false%2C%22overead%22:false}; qunar-assist-ignore=[%22https://www.qunar.com/%22]; QN163=0; QN6=auto_4e0d874a; QN25=6602c1e0-ae80-4659-9619-e709a3d77216-9f992f90; QN42=mjiq0111; _q=U.fpnhmxe2812; _t=27387490; csrfToken=AzcqVpY4bmfLMKkfAbsmWnsdMT2fGmsg; _s=s_5452FRSKU2J6MQYLQ37QBEVPBY; _v=PC1b8KbOc-0LeyQZG6e3TZLMo8bVpObGJcAYeGuaJwJGw9GJo65ZmidayQKVThjZ0obxA_QT2HXDhmwslkfgrH_LFxth3Wj7ii1oHl5FKEaY3uhpstLGWgH03UDtcXwH3l6I6XrDB78BXJRA8Zp_XLXvN3S-4JnRwS9SS634gDSG; _vi=aEpXhPrURgDs599zZE5R59dj3CxeNdLAX3dIO9nwJRRvJycK9x4xHYBAxqTtQjomkSugVIswwqIZ_eNzwLnbR3FIqFFd_tuhvRi9FM3WAzKeQXBc792NETkD2cxGdm7o3gZIjpYOSfe7docvcAZabnoSHAZOZBfF3JSEc0hBXRAa; viewbook=7042357|7397301|7397301|6313894|7397301; QN44=fpnhmxe2812; QN267=8958728936dd37643; Hm_lvt_c56a2b5278263aa647778d304009eafc=1628672633,1628673661,1628734107,1628734222; Hm_lpvt_c56a2b5278263aa647778d304009eafc=1628734222; QN271=4366dc9e-ac3c-496d-b30b-ac3e1c66266a'
}
return headers
def get_all_id(self):
url_list = [f'https://travel.qunar.com/travelbook/list/%E6%B7%B1%E5%9C%B3/hot_heat/{i}.htm' for i in range(self.start_num,self.end_num)]
req_list = [grequests.get(url,headers=self.get_main_headers(),timeout=15) for url in url_list]
html_list = grequests.imap(req_list,size=30)
href_list = []
i = 1
for html in html_list:
# if html.stauts_code == 200:
tree = etree.HTML(html.text)
li_list = tree.xpath('//ul[@class="b_strategy_list "]/li')
for li in li_list:
href = li.xpath('.//h2/a/@href')[0].split('/')[-1]
href_list.append(href)
print(f'第{i}链接已经加入列表')
i += 1
with open('./del_href.csv','w',encoding='utf-8') as file:
writer = csv.writer(file)
writer.writerow(href_list)
print(f'链接已经爬取完毕,共{len(href_list)}条数据')
# 得到所有页的数据id
def get_del_url(self):
base_url = 'https://travel.qunar.com/travelbook/note/'
with open('./del_href.csv','r',encoding='utf-8') as file:
lines = list(csv.reader(file))[0]
# print(lines)
a = 1
# 读取每一行,此时lines是一个大列表
href_list = [base_url + i for i in lines]
# print(href_list)
# print(len(href_list))
req_list = []
for url in href_list:
ex = self.redis_conn.sadd('qunaer_url',url)
if ex == 1:
req = grequests.get(url,headers=self.get_headers(url.split('/')[-1]),proxies=self.get_proxy(),timeout=20)
req_list.append(req)
else:
print('重复数据,无需爬取')
continue
html_list = grequests.imap(req_list,size=5,exception_handler=self.handler_exception)
for html in html_list:
if '登录' in html.text:
self.get_del_data(html)
print(f'第{a}条数据已经解析完毕')
a += 1
else:
sleep(random.randint(0,1))
try:
html = requests.get(url=html.url,headers=self.get_headers(html.url.split('/')[-1]),proxies=self.get_proxy(),timeout=20)
except requests.exceptions.ConnectionError:
pass
if '登录' in html.text:
self.get_del_data(html)
print(f'第{a}条数据已经解析完毕')
a += 1
else:
# print('正在重新发送请求')
pass
#异步请求所有的url
def get_del_data(self,html):
tree = etree.HTML(html.text)
title = tree.xpath('//span[@id="booktitle"]/text() | //h1[@class="name"]/text()')[0]
title = ''.join(re.findall(r'[^\*"/:?\\|<>\.]', title, re.S)).strip()
file_path = f'./data/{title}'
if not os.path.exists(file_path):
os.mkdir(file_path)
text_path = file_path + f'/{title}.txt'
pic_url_list = tree.xpath('//img/@data-original')
with open(text_path, 'a', encoding='utf-8') as file:
div_list = tree.xpath('//div[@class="e_main"]/div')
for div in div_list:
big_title = div.xpath('.//div[@class="text"]/text()')[0]
file.write(big_title)
file.write('\n')
div_list_2 = div.xpath('./div[@class="period_ct"]/div')
for div in div_list_2:
medium_title = ''.join(div.xpath('.//div[@class="b_poi_title_box"]//text()'))
file.write(medium_title)
file.write('\n')
content_list = div.xpath('.//div[@class="imglst"]/div')
for content in content_list:
content = content.xpath('string(.)')[:]
file.write(content)
file.write('\n')
img_list = div.xpath('.//div[@class="imglst"]/dl')
if img_list:
for img in img_list:
img_text = img.xpath('string(.)')[:]
if img_text:
file.write(img_text)
file.write('\n')
img_req_list = [grequests.get(url) for url in pic_url_list]
img_html_list = grequests.imap(img_req_list, size=500)
for img_html in img_html_list:
name = img_html.url.split('/')[-1]
name_path = file_path + f'/{name}'
with open(name_path, 'wb') as file:
file.write(img_html.content)
# print(f'第{html.url.split("/")[-1]}已经解析完毕')
# 解析详情页的信息
def handler_exception(self,Requests,Exception):
# print('访问链接失败,正在重新尝试')
html = requests.get(url=Requests.url,headers=self.get_main_headers(),timeout=20)
return html
# 异常处理
if __name__ == '__main__':
q = Qunaer()
q.get_all_id()
q.get_del_url()
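get_del_url() deduplicates by calling Redis SADD and only requesting a URL when the return value is 1. That incremental-crawl check in isolation looks like this (a sketch reusing the connection settings from __init__; the function name is mine):
import redis

redis_conn = redis.Redis(host='172.22.30.106', port=6379, db=4, encoding='utf8')

def is_new_url(url):
    # SADD returns 1 when the member was newly added, 0 when it was already in the set.
    return redis_conn.sadd('qunaer_url', url) == 1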