# 直接上代码 (Straight to the code.)
#coding=UTF-8
from urllib.request import urlopen
from pyquery import PyQuery as pq
import re
import pymongo
import threading
# Shared MongoDB handle: database "taobao", collection "mutitry47160".
client = pymongo.MongoClient(host='localhost', port=27017)
table = client['taobao']['mutitry47160']

# Held by father_link() while it bumps the shared page counter ``i``.
lock = threading.Lock()
# List-page counter shared by all crawler threads; father_link() increments
# it before building the list-page URL.
i = 1
def save_to_mongo(result):
    """Insert one scraped record into the module-level ``table`` collection.

    Args:
        result: Dict describing one image; the callers in this file pass
            ``{'image': ..., 'title': ...}``.

    Returns:
        None. Failures are printed and never raised, so one bad write does
        not kill the crawler thread that called this function.
    """
    try:
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document write.
        table.insert_one(result)
    except Exception as exc:
        # Best-effort persistence: report the record and the cause, carry on.
        print('存储到Mongo失败', result, exc)
    else:
        print('存储到Mongo成功')
def download_son(Son_link,l):
    """Fetch one follow-up page of a gallery and store its image record.

    Args:
        Son_link: Site-relative gallery path; ``index_<l>.html`` is appended
            to it directly, with no separator added.
        l: Page index within the gallery.
    """
    page_url = ''.join(
        ('https://www.7160.com', Son_link, 'index_', str(l), '.html')
    )
    page = pq(page_url, encoding='gbk')
    # Both fields are attributes of the same <img>, so select it only once.
    img = page('.picsbox.picsboxcenter p a img')
    record = {'image': img.attr('src'), 'title': img.attr('alt')}
    print(record)
    save_to_mongo(record)
def father_link():
    """Crawl one list page and every gallery linked from it.

    Claims the next list-page number from the shared counter ``i``, fetches
    that list page, and for each gallery entry stores the first image via
    ``save_to_mongo`` and starts one ``download_son`` thread per remaining
    gallery page.
    """
    global i
    # Take a private snapshot of the counter while the lock is held. Reading
    # the global again after releasing the lock would let another thread
    # change it first, so two threads could crawl the same page.
    # NOTE(review): the counter is incremented before use, so with i starting
    # at 1 the first page requested is list_1_2.html -- confirm page 1 is
    # meant to be skipped.
    with lock:
        i += 1
        page = i

    banner = '--------------------------------------'
    url = 'https://www.7160.com/rentiyishu/list_1_' + str(page) + '.html'
    print(banner + str(page) + banner)

    doc = pq(url, encoding='gbk')
    for item in doc('.news_bom-left li').items():
        Son_link = item.find('a').attr('href')
        if not Son_link:
            # Entry without a link: nothing to crawl for this item.
            continue

        doc2 = pq('https://www.7160.com' + Son_link, encoding='gbk')
        # Both fields are attributes of the same <img>, so select it once.
        img = doc2('.picsbox.picsboxcenter p a img')
        product = {'image': img.attr('src'), 'title': img.attr('alt')}
        save_to_mongo(product)

        # Read the gallery's page count from the first pager link.
        pager_text = doc2('body > div > div.center > div.NEWS > div.picmainer > div.itempage > a:nth-child(1)').text()
        numbers = re.findall(r"\d+\.?\d*", pager_text)
        try:
            page_num = int(numbers[0])
        except (IndexError, ValueError):
            # No usable page count: only the first image is stored.
            continue

        print('共%d页,开始爬取' % page_num)
        for l in range(2, page_num + 1):
            worker = threading.Thread(target=download_son, args=(Son_link, l))
            try:
                worker.start()
            except RuntimeError as exc:
                # The interpreter could not start another thread; give up on
                # the rest of this gallery and move on to the next one.
                print('线程启动失败', Son_link, l, exc)
                break
def main():
    """Start 107 crawler threads, each running ``father_link`` once."""
    for _ in range(107):
        crawler = threading.Thread(target=father_link)
        crawler.start()


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()