本篇文章将爬取"零度编程"的文章列表(传送门:https://www.xcode.me/)。
我们只爬取列表页中的文章条目(缩略图、标题、发布时间、阅读数),以及各文章详情页的正文和下载信息。
废话不多说,直接上代码。本人很菜,大神勿喷。
import requests
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool as ThreadPool
# Base address of the paginated article list; a page number is appended to it.
url = "https://www.xcode.me/page/"

# Module-level accumulators shared by the crawler functions:
#   img  - thumbnail addresses, data - one dict per article, urls - list pages.
img, data, urls = [], [], []
def _strip_affixes(text, prefix, suffix):
    """Return *text* without one leading *prefix* and one trailing *suffix*.

    ``str.lstrip`` / ``str.rstrip`` treat their argument as a *set of
    characters*, not as a literal affix, so they are not used here.
    """
    if text.startswith(prefix):
        text = text[len(prefix):]
    if suffix and text.endswith(suffix):
        text = text[:-len(suffix)]
    return text


def _extract_download(detail):
    """Return ``(downPwd, downUrl)`` found on a parsed article detail page.

    Both values are empty strings when the first ``<span>`` inside the
    article body does not contain the download-key marker.  ``downUrl`` is
    also empty when the article body has no link (or the link has no
    ``href``), instead of raising.
    """
    marker = "下载密钥:"
    spans = detail.select("article.article-content p span")
    if not spans or marker not in spans[0].text:
        return "", ""

    # Keep only what follows the marker; strip(marker) would instead remove
    # any of the marker's characters from both ends of the text.
    password = spans[0].text.partition(marker)[2]

    links = detail.select("article.article-content p a")
    link = links[0].get("href", "") if links else ""
    return password, link


def page_one(u):
    """Scrape one list page and record every article found on it.

    For each ``article.excerpt`` element the thumbnail address is appended
    to the module-level ``img`` list, the article's detail page is fetched,
    and a dict with the keys ``ImgUrl``, ``Title``, ``Time``, ``Look``,
    ``Content``, ``downPwd`` and ``downUrl`` is appended to the module-level
    ``data`` list.

    Args:
        u: Address of the list page to fetch.

    Raises:
        requests.RequestException: if a request fails or times out.
        IndexError: if an expected element is missing from the markup.
    """
    # Browser-like request headers.  The original built this dict but never
    # sent it; the Accept value is written without the stray spaces, and the
    # Content-Type entry is dropped because these GET requests have no body.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
    }
    # requests waits indefinitely unless a timeout is given, which would
    # hang the calling worker thread on a stalled connection.
    timeout = 10

    res = requests.get(u, headers=headers, timeout=timeout)
    result = BeautifulSoup(res.text, "lxml")

    for item in result.select("article.excerpt"):
        thumb = item.select("a img.thumb")[0]
        try:
            # Prefer "data-src"; fall back to "src" when it is absent.
            src = thumb["data-src"]
        except KeyError:
            src = thumb["src"]
        img.append(src)

        title = item.select("header h2")[0].text
        published = item.select("p.meta time")[0].text
        look = _strip_affixes(item.select("p.meta .pv")[0].text, "阅读(", ")")

        detail_href = item.select("header h2 a")[0]["href"]
        detail_page = requests.get(detail_href, headers=headers, timeout=timeout)
        detail = BeautifulSoup(detail_page.text, "lxml")

        # The last 18 characters of the article text are discarded, exactly
        # as before (NOTE(review): presumably a fixed footer - confirm).
        body_text = detail.select("article.article-content")[0].text
        content = str(body_text.strip("\n")[:-18])

        downPwd, downUrl = _extract_download(detail)

        data.append({'ImgUrl': src, 'Title': title, 'Time': published, 'Look': look, 'Content': content, 'downPwd': downPwd, 'downUrl': downUrl})
def CreateUrl():
    """Append the addresses of list pages 0 through 20 to ``urls``.

    Each address is the module-level ``url`` followed by the page number.
    The list is extended, not reset, so repeated calls add the pages again.
    """
    urls.extend(url + str(page_no) for page_no in range(21))
def Start():
    """Crawl every list page on a pool of four threads and print the result.

    Fills ``urls`` via ``CreateUrl()``, runs ``page_one`` over them, and
    prints the module-level ``data`` list once all pages are done.
    """
    CreateUrl()
    pool = ThreadPool(4)
    try:
        # page_one returns nothing; it appends to the shared lists, so the
        # value returned by map() is not kept.
        pool.map(page_one, urls)
    finally:
        # Release the worker threads even when a page raised.
        pool.close()
        pool.join()
    print(data)
from pymongo import MongoClient
# MongoDB connection parameters: server address and port, plus the database
# name ("db_name") and collection name ("set_name") to use.
settings = dict(
    ip='localhost',
    port=27017,
    db_name="python",
    set_name="linduBlog",
)
class MyMongoDB(object):
    """Small convenience wrapper around a single MongoDB collection.

    The server address, database name and collection name are read from the
    module-level ``settings`` dict.
    """

    def __init__(self):
        """Open the client and select the configured database/collection.

        Any exception raised by ``MongoClient`` propagates to the caller.
        (Previously it was printed and then masked by an ``AttributeError``
        on the next line, because ``self.conn`` had never been assigned.)
        """
        self.conn = MongoClient(settings["ip"], settings["port"])
        self.db = self.conn[settings["db_name"]]
        self.my_set = self.db[settings["set_name"]]

    def insert(self, dic):
        """Insert one document (a dict) or several (an iterable of dicts).

        An empty iterable is skipped, because ``insert_many`` rejects an
        empty batch.
        """
        print("inser...")
        if isinstance(dic, dict):
            self.my_set.insert_one(dic)
            return
        documents = list(dic)
        if documents:
            self.my_set.insert_many(documents)

    def update(self, dic, newdic):
        """Update the first document matching *dic* using *newdic*.

        When *newdic* contains update operators (keys starting with ``$``)
        they are applied to the matched document; otherwise the matched
        document is replaced by *newdic*.  This mirrors what the legacy
        ``Collection.update`` call did with its default arguments.
        """
        print("update...")
        if any(str(key).startswith("$") for key in newdic):
            self.my_set.update_one(dic, newdic)
        else:
            self.my_set.replace_one(dic, newdic)

    def delete(self, dic):
        """Delete every document matching the filter *dic*."""
        print("delete...")
        self.my_set.delete_many(dic)

    def dbfind(self, dic):
        """Print the ``name`` and ``age`` fields of each matching document.

        A field that a document does not have is printed as ``None``
        instead of raising ``KeyError``.
        """
        print("find...")
        for result in self.my_set.find(dic):
            print(result.get("name"), result.get("age"))
if __name__ == '__main__':
    # Crawl first, then store everything collected in ``data`` with a single
    # insert call.
    Start()
    MyMongoDB().insert(data)