# Crawl the Fiverr site for scraping-related gigs and their gig images
#coding=utf-8
import os
import queue
import re
import threading

import requests
class MyThread(threading.Thread):
    """Thread wrapper that invokes ``func(thread, kwargs)`` and exposes a
    cooperative stop flag which the target may poll as ``thread.is_running``."""

    def __init__(self, func, **kwargs):
        super().__init__()
        # Cooperative cancellation flag; worker functions read this between
        # items to decide whether to keep looping.
        self.is_running = True
        self.func = func
        self.kwargs = kwargs

    def run(self):
        # Hand the target this thread object (so it can check is_running)
        # plus the keyword-argument dict captured at construction time.
        self.func(self, self.kwargs)

    def stop(self):
        """Request the worker to exit; honored at its next flag check."""
        self.is_running = False
class CrawlFiverrr:
    """Crawl Fiverr data-analysis listing pages: collect gig titles into
    titles1.txt and download gig images into images/, using three queues
    wired to worker threads (fetch -> process -> download)."""

    def __init__(self):
        # URLs of listing pages waiting to be fetched.
        self.crawlQueue = queue.Queue()
        # Raw page bodies waiting to be parsed.
        self.responseQueue = queue.Queue()
        # Image URLs waiting to be downloaded.
        self.downloadQueue = queue.Queue()

    def secherdule(self):
        """Seed the crawl queue with listing-page URLs for pages 1..19.

        NOTE(review): name is a misspelling of "schedule" but is kept
        unchanged for caller compatibility.
        """
        for page in range(1, 20):
            print("page={}".format(page))
            url = ('https://www.fiverr.com/categories/programming-tech/'
                   'data-analysis-services?source=gallery-listing&page={}'
                   '&page_size=100&offset=-2&filter=rating'
                   '&ref=tool%3Apython').format(page)
            self.crawlQueue.put(url)

    def fetcher(self, parent, kwargs=None):
        """Worker: pull URLs from crawlQueue, fetch them, push page text
        onto responseQueue.  `parent` is the owning MyThread whose
        is_running flag signals shutdown."""
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36'
        }
        while parent.is_running:
            try:
                # Bounded wait instead of an indefinitely blocking get()
                # so that stop() can actually terminate this worker.
                url = self.crawlQueue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                # timeout so one stalled server cannot hang the worker.
                req = requests.get(url, headers=headers, timeout=30)
                # req.text honors the response's declared encoding, unlike
                # the previous strict-UTF-8 bytes.decode(req.content).
                self.responseQueue.put(req.text)
            except requests.RequestException as e:
                # Best-effort: log and move on rather than killing the thread.
                print("fetch failed: {}: {}".format(url, e))
            finally:
                self.crawlQueue.task_done()

    def processer(self, parent, kwargs=None):
        """Worker: parse fetched pages, append gig titles to titles1.txt,
        and enqueue each gig image URL for download."""
        while parent.is_running:
            try:
                content = self.responseQueue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                titles = re.findall(r'"title":"(.*?)"', content)
                print(len(titles))
                with open("titles1.txt", 'a+', encoding='utf-8') as f:
                    f.write("\n".join(titles))
                img_refs = re.findall(r'"cloud_img_main_gig":"(.*?)"', content)
                print(len(img_refs))
                for img_url in img_refs:
                    # URLs embedded in JSON carry escaped slashes ("\/");
                    # unescape so requests gets a valid URL (no-op otherwise).
                    self.downloadQueue.put(img_url.replace('\\/', '/'))
            finally:
                self.responseQueue.task_done()

    def download(self, parent, kwargs=None):
        """Worker: download queued image URLs into the images/ directory."""
        # The original code assumed images/ already existed; create it.
        os.makedirs("images", exist_ok=True)
        while parent.is_running:
            try:
                img_url = self.downloadQueue.get(timeout=1)
            except queue.Empty:
                continue
            try:
                r = requests.get(img_url, timeout=30)
                filename = img_url.split("/")[-1]
                with open(os.path.join("images", filename), 'wb') as f:
                    f.write(r.content)
            except requests.RequestException as e:
                print("download failed: {}: {}".format(img_url, e))
            finally:
                self.downloadQueue.task_done()
if __name__=="__main__":
crawlFiverrrHelper = CrawlFiverrr()
crawlFiverrrHelper.secherdule()
threadsList=[]
t=MyThread(crawlFiverrrHelper.fetcher)
threadsList.append(t)
t.start()
for i in range(10):
t=MyThread(crawlFiverrrHelper.processer)
threadsList.append(t)
t.start()
for i in range(10):
t=MyThread(crawlFiverrrHelper.download)
threadsList.append(t)
t.start()
for t in threadsList:
t.join()