python版本:3.6.1
开发工具:PyCharm社区版,Anaconda3
数据库:MongoDB
可视化MongoDB工具:MongoVUE
1.开启数据库后,打开MongoVUE使MongoDB可视化。
2.用PyCharm编写代码,爬取数据并保存到数据库中。
3.刷新数据库查看是否保存数据成功。
# -*- coding:utf-8 -*-
from lxml import etree
import requests
import re
import time
import random
from bs4 import BeautifulSoup
import pymongo
def get_max_page(url,headers):
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
time.sleep(2)
for h in soup.select('#post-list-posts > li > div.inner > a.thumb '):
image_id = h.get('href').split('/')[-1]
# post_sub.insert_one({'img_id': image_id})
each_pic_id = 'https://yande.re' + h.get('href')
print(image_id)
get_img_url(each_pic_id,headers,image_id)
def get_img_url(id_url,headers,image_id):
html = requests.get(id_url, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')
time.sleep(2)
for i in soup.select('#image'):
img_url = i.get('src')
post_sub.insert_one({'img_id':image_id, 'img_url': img_url}) # insert_one:插入一条数据,for:遍历, 一条一条插入
print(img_url)
if __name__ == '__main__':
tag = input('请输入你要下载的类型(标签):')
max_page = input('请输入你要下载的页数:')
for n in range(1, int(max_page)+1):
url = 'https://yande.re/post?page=' + str(n) + '&tags=' + str(tag)
user_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
]
UA = random.choice(user_agent_list)
headers = {'User-Agent': UA,
# 'Referer':'https://yande.re/post',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Host': 'yande.re',
'Accept - Encoding': 'gzip,deflate,br',
'Accept - Language': 'zh - CN, zh;q=0.8',
'Connection': 'keep - alive',
}
connection = pymongo.MongoClient() # 连接MongDB数据库
post_info = connection.yande_test # 指定数据库名称(yande_test),没有则创建
post_sub = post_info.test # 获取集合名:test
get_max_page(url,headers)
4.将保存的数据导出csv
注:没有配置环境变量的话在Mongodb的bin目录下打开命令窗口输入语句:
mongoexport -d yande_test -c test -f _id,name,password,adress --type=csv -o 文件名.csv