#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape download links for films listed on the 6V movie site.

The user picks a search keyword and a page range; every movie detail page in
that range is fetched by a small pool of worker threads and the download links
found on it are appended to an HTML file that can be opened in a browser.
"""

import html
import math
import re
import sys
import threading
import time
from queue import Empty, Queue

import requests
from bs4 import BeautifulSoup as bs

from other import getUser_Agent

# Request headers (a randomly chosen User-Agent) shared by every request.
headers = getUser_Agent.getUser_Agent()

# Seconds to wait for the site before giving up on a request.
REQUEST_TIMEOUT = 10
# Upper bound on concurrently running worker threads.
MAX_THREADS = 16
# Divisor used to turn the number shown on the result page into a page count.
MOVIES_PER_PAGE = 20

# Substrings that mark an href as a direct download link.
_DOWNLOAD_MARKERS = ('ed2k', '.torrent', 'thunder', 'ftp')
_PASSWORD_RE = re.compile(r'密码:\w{4}')
_TITLE_RE = re.compile(r'《(.*?)》')


class Movielinks(threading.Thread):
    """Worker thread that drains a queue of movie page URLs.

    Args:
        que: ``queue.Queue`` of movie detail page URLs shared by all workers.
        filepath: Path of the HTML file the links are appended to.
        totalcount: Number of URLs initially queued (used for the progress
            percentage).
        starttime: ``time.time()`` value taken when the crawl started.
    """

    # All workers append to the same file, so writes are serialised.
    _write_lock = threading.Lock()

    def __init__(self, que, filepath, totalcount, starttime):
        super().__init__()
        self._que = que
        self._filepath = filepath
        self._totalcount = totalcount
        self._starttime = starttime

    def run(self):
        """Process URLs until the shared queue is exhausted."""
        while True:
            try:
                # Non-blocking: a blocking get() after an empty() check can
                # hang forever when another worker takes the last URL.
                url = self._que.get_nowait()
            except Empty:
                return
            try:
                self.showdetail()
                self.spider(url)
            except Exception as exc:  # thread boundary: report, keep working
                print('error--->def run(self): %s' % exc)

    def spider(self, url):
        """Fetch one movie page and append its download links to the file.

        A page that does not answer with HTTP 200 is reported on stdout and
        skipped. Any failure while fetching or parsing is reported on stdout
        and recorded in the output file as an "unable to crawl" line.
        """
        label = url
        try:
            # headers must be passed by keyword; the second positional
            # argument of requests.get() is ``params``.
            r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            if r.status_code != 200:
                print('%s 该磁力链接已坏!!' % url)
                return
            soup = bs(r.content.decode('gbk', errors='replace'), 'html.parser')
            label = self._movie_title(soup, url)
            chunks = self._render_links(soup, label)
        except Exception as exc:  # per-URL boundary: one bad page must not stop the crawl
            print('error--->def spider(self,url): %s (%s)' % (url, exc))
            chunks = ['%s 无法正常爬取,尽情谅解' % html.escape(label), '\n<br/>']
        self._append(''.join(chunks))

    @staticmethod
    def _movie_title(soup, fallback):
        """Return the 《...》 part of the page title, else the title, else *fallback*."""
        raw = soup.title.string if soup.title is not None else None
        found = _TITLE_RE.search(raw) if raw else None
        if found:
            return found.group()
        return raw or fallback

    @staticmethod
    def _render_links(soup, title):
        """Build the HTML fragments describing every link in the page's table cells."""
        # NOTE(review): the anchor / line-break markup below is rebuilt from
        # the argument lists of the write calls; confirm it matches the
        # intended page layout.
        safe_title = html.escape(title)
        parts = []
        for cell in soup.find_all('td'):
            cell_html = str(cell)
            if '下载帮助' in cell_html:
                continue
            password = _PASSWORD_RE.search(cell_html)
            for anchor in cell.find_all('a'):
                target = anchor.get('href') or ''
                # Scraped values are escaped before being embedded in HTML.
                safe_target = html.escape(target, quote=True)
                if any(marker in target for marker in _DOWNLOAD_MARKERS):
                    parts.append('<a href="%s" target="%s">迅雷下载链接:%s</a>'
                                 % (safe_target, '_blank', safe_title))
                elif 'baidu' in target and password:
                    parts.append('<a href="%s" target="%s">百度网盘链接:%s</a>%s'
                                 % (safe_target, '_blank', safe_title,
                                    html.escape(password.group())))
                elif '正版观看' in str(anchor):
                    parts.append('%s需要正版版权才能观看' % safe_title)
                else:
                    parts.append('%s 的该条链接无法正常爬取,尽情谅解' % safe_title)
                parts.append('\n<br/>')
        parts.append('<br/>')
        return parts

    def _append(self, text):
        """Append *text* to the output file in a single locked write."""
        with Movielinks._write_lock:
            with open(self._filepath, 'a', encoding='utf-8') as out:
                out.write(text)

    def showdetail(self):
        """Print the progress percentage and elapsed time on one console line."""
        usetime = time.time() - self._starttime
        total = float(self._totalcount)
        per = 100 - (float(self._que.qsize()) / total) * 100 if total else 100.0
        sys.stdout.write('\r' + '下载链接进度:%.2f %s 用时:%.3f 秒'
                         % (float(per), '%', float(usetime)))
        sys.stdout.flush()


def getMovieCount(searchid):
    """Print how many result pages exist for *searchid*.

    The first number in the result page's ``channellist`` heading is divided
    by ``MOVIES_PER_PAGE`` and rounded up.

    Returns:
        The page count, or ``None`` when the heading or its number is missing.
    """
    url = 'http://www.6vhao.tv/e/search/result/?searchid=' + str(searchid)
    r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    soup = bs(r.content, 'html.parser')
    channel = soup.find(name='div', attrs='channellist')
    heading = None
    if channel is not None and channel.h2 is not None:
        heading = channel.h2.string
    found = re.search(r'\d+', heading) if heading else None
    if found is None:
        print('无法获取电影总量')
        return None
    # Round up: a partially filled last page is still a page.
    pages = math.ceil(int(found.group()) / MOVIES_PER_PAGE)
    print('%s 一共有%d页' % (heading, pages))
    return pages


def getWantPagesUrls(url, startpage, endpage, searchid):
    """Return the result-page URLs for pages *startpage*..*endpage* inclusive.

    Args:
        url: URL prefix ending in ``page=``; the page number is appended.
        startpage: First page number.
        endpage: Last page number (inclusive).
        searchid: Search id appended as the ``searchid`` query parameter.
    """
    return [url + str(page) + '&searchid=' + str(searchid)
            for page in range(startpage, endpage + 1)]


def getpagesLinks(urls):
    """Return the movie detail links found on each result page in *urls*.

    The link is the ``href`` of the first anchor inside every
    ``div.listimg``; entries without such an anchor are skipped.
    """
    moviehref_list = []
    for url in urls:
        r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        soup = bs(r.content, 'html.parser')
        for div in soup.find_all('div', class_='listimg'):
            anchor = div.find('a')
            if anchor is not None and anchor.get('href'):
                moviehref_list.append(anchor['href'])
    return moviehref_list


def getqueue(urls):
    """Return a ``Queue`` pre-filled with *urls* in order."""
    que = Queue()
    for url in urls:
        que.put(url)
    return que


def main():
    """Ask for keyword, page range and output path, then run the crawl."""
    tishi = '''
    *6v电影网站电影链接爬虫*
    #要搜索的常用关键字代码#
    1.韩国--->185773
    2.日本--->185691
    3.国产--->186504
    4.美国--->187181
    5.英国--->188161
    6.香港--->188461
    7.喜剧--->185441
    8.恐怖--->187193
    9.悬疑--->190226
    10.记录片--->187963
    11.科幻--->189866
    12.战争--->187830
    13.动画--->187978
    '''
    print(tishi)
    keywords = {1: 185773, 2: 185691, 3: 186504, 4: 187181, 5: 188161,
                6: 188461, 7: 185441, 8: 187193, 9: 190226, 10: 187963,
                11: 189866, 12: 187830, 13: 187978}
    url = 'http://www.6vhao.tv/e/search/result/index.php?page='
    choice = input('请输入搜索关键字序号>>>')
    try:
        searchid = keywords[int(choice)]
    except (ValueError, KeyError):
        print('无效的关键字序号: %s' % choice)
        return
    print('正在计算电影总量... ...')
    getMovieCount(searchid)
    startpage = int(input('请输入起始页码>>>'))
    endpage = int(input('请输入终止页码>>>'))
    filepath = input('请输入电影链接要保存的文件路径(扩展名是:.html)')
    print('The program is running,Please waiting... ...')
    urls = getpagesLinks(getWantPagesUrls(url, startpage, endpage, searchid))
    que = getqueue(urls)
    total = que.qsize()
    print('本次下载行为:从%d页到%d页,有%d部电影正在下载...' % (startpage, endpage, total))
    starttime = time.time()
    # Workers loop over the queue, so a bounded pool handles any number of
    # movies; one thread per movie is unnecessary.
    threads = [Movielinks(que, filepath, total, starttime)
               for _ in range(min(total, MAX_THREADS))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()
爬取网页时需要模拟浏览器的行为,因此请求时要带上一个 User-Agent,相关代码如下:
from random import choice, randint

# Pool of browser User-Agent strings; built once instead of on every call.
_USER_AGENTS = (
    # Desktop browsers
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',  # Safari 5.1 - Mac
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',  # Safari 5.1 - Windows
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',  # IE 9.0
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',  # IE 8.0
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',  # IE 7.0
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',  # IE 6.0
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',  # Firefox 4.0.1 - Mac
    'Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1',  # Firefox 4.0.1 - Windows
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11',  # Opera 11.11 - Mac
    'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11',  # Opera 11.11 - Windows
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',  # Chrome 17.0 - Mac
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)',  # Maxthon
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)',  # Tencent TT
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',  # The World 2.x
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)',  # The World 3.x
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)',  # Sogou browser 1.x
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)',  # 360 browser
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)',  # Avant
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',  # Green Browser
    # Mobile browsers
    'Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',  # Safari iOS 4.33 - iPhone
    'Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',  # Safari iOS 4.33 - iPod Touch
    'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',  # Safari iOS 4.33 - iPad
    'Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',  # Android Nexus One
    'MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1',  # QQ browser for Android
    'Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10',  # Opera Mobile for Android
    'Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+',  # BlackBerry 9800
    'Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0',  # WebOS HP TouchPad
    'Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124',  # Nokia N97
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)',  # Windows Phone Mango
    'UCWEB7.0.2.37/28/999',  # UC (plain)
    'Openwave/ UCWEB7.0.2.37/28/999',  # UC Openwave
    'Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999',  # UC Opera
)


def getUser_Agent():
    """Return request headers carrying a randomly chosen User-Agent.

    Returns:
        A dict with the single key ``'User-Agent'`` whose value is picked
        uniformly at random from the built-in pool of browser strings.
    """
    return {'User-Agent': choice(_USER_AGENTS)}