1. Spider dmoz_spider.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: donghui
import scrapy
import re
from urllib.parse import quote
from tutorial.items import DmozItem


class DmozSpider(scrapy.Spider):
    name = "dmoz"
    allowed_domains = ["btkuai.org"]  # the site actually being crawled
    # Build the first nine listing pages for the URL-encoded keyword "风光"
    start_urls = [
        "http://www.btkuai.org/word/" + quote("风光") + "_{}.html".format(n)
        for n in range(1, 10)
    ]

    def savefile(self, filename, var):
        # The path must be correct relative to where the crawl is started
        f = open("tutorial/res/" + filename + ".csv", "w+")
        f.write(var)
        f.close()
        # print("Saved")

    def parse(self, response):
        url_head = 'http://www.btkuai.org'
        # filename = response.url.split("/")[-2]
        selector = response.xpath('//div[@id="container"]/div/ul/li/div[@class="T1"]')
        for sel in selector:
            title = sel.xpath('a/text()').extract()[0]
            link = url_head + sel.xpath('a/@href').extract()[0]
            # Only keep links that look like full URLs ending in .html
            if re.findall(r'([a-zA-Z]+://[^\s]*html$)', link, re.S):
                # print(title, link)
                # self.savefile(filename, title + "," + link)
                item = DmozItem()
                item['title'] = title
                item['link'] = link
                yield item
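To run the spider, execute scrapy crawl dmoz from the project root. Note that the commented-out savefile helper writes to tutorial/res/, so that directory must already exist if you enable it.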
2. Items items.py
import scrapy


class DmozItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    link = scrapy.Field()
3. Proxy IP pool middlewares.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: donghui
# Random choice and JSON parsing
import random
import json
# IPPOOL and the pool mode are defined in the project's settings.py
from .settings import IPPOOL, IPPoolMode
# Built-in proxy middleware that we subclass (current import path)
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware


class IPPOOlS(HttpProxyMiddleware):
    # Proxies collected earlier and saved as JSON; loaded once when the class is defined
    with open("../EffectiveIp.json", 'r') as handler:
        ips = json.load(handler)

    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        if IPPoolMode == 0:
            # Mode 0: pick a random proxy from the IPPOOL list in settings.py
            thisip = random.choice(IPPOOL)
            print("Proxy IP: %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
        elif IPPoolMode == 1:
            # Mode 1: pick a random proxy loaded from EffectiveIp.json
            thisip = random.choice(IPPOOlS.ips)
            print("Proxy IP: %s" % thisip["http"])
            request.meta["proxy"] = "http://" + thisip["http"]
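The middleware above loads ../EffectiveIp.json at import time, but that file is not shown in this post. Judging from how thisip["http"] is used, it presumably holds the same shape as IPPOOL in settings.py; a hypothetical helper that writes such a file:

# Hypothetical helper: writes EffectiveIp.json in the shape the middleware expects.
# The addresses below just reuse entries from IPPOOL; replace them with your own proxies.
import json

effective_ips = [
    {"http": "125.32.250.240:8060"},
    {"http": "183.159.93.165:61234"},
]

with open("EffectiveIp.json", "w") as handler:
    json.dump(effective_ips, handler)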
4. User-Agent pool uamid.py
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# Author: donghui
# Random choice
import random
# The user-agent pool UPPOOL is defined in the project's settings.py
from .settings import UPPOOL
# Built-in user-agent middleware that we subclass (current import path)
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


class Uamid(UserAgentMiddleware):
    # The attribute must be named user_agent, otherwise the base class raises errors
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick a random user agent from the pool for this request
        thisua = random.choice(UPPOOL)
        print("Current User-Agent: " + thisua)
        request.headers.setdefault('User-Agent', thisua)
5. settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for tutorial project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'tutorial'

SPIDER_MODULES = ['tutorial.spiders']
NEWSPIDER_MODULE = 'tutorial.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'tutorial (+http://www.btkuai.org)'
USER_AGENT = 'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# MongoDB configuration
MONGO_HOST = "127.0.0.1"   # host IP
MONGO_PORT = 27017         # port
MONGO_DB = "btKuai"        # database name
MONGO_COLL = "fengguang"   # collection

# 0 = take proxies from the IPPOOL list below, 1 = take proxies from the EffectiveIp.json file
IPPoolMode = 1

# Proxy IP pool
IPPOOL = [
    {"http": "125.32.250.240:8060"},
    {"http": "183.159.93.165:61234"},
    {"http": "119.49.33.238:8060"},
    {"http": "119.187.120.118:8060"},
    {"http": "120.25.203.182:7777"},
    {"http": "121.17.18.219:8060"},
    {"http": "123.8.41.163:8060"},
    {"http": "119.41.236.180:8010"},
    {"http": "121.17.18.218:8060"},
    {"http": "114.55.0.166:8090"},
    {"http": "118.122.105.99:9000"},
    {"http": "45.115.39.139:7777"}
]

# User-agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
]

# Disable local cookies
COOKIES_ENABLED = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'tutorial.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    #'tutorial.middlewares.MyCustomDownloaderMiddleware': 543,
    'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    'tutorial.middlewares.IPPOOlS': 125,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    'tutorial.uamid.Uamid': 1
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'tutorial.pipelines.SomePipeline': 300,
    'tutorial.pipelines.BtKuaiMongo': 300,
    'tutorial.pipelines.JsonWritePipline': 300
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
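The two item pipelines enabled above, tutorial.pipelines.BtKuaiMongo and tutorial.pipelines.JsonWritePipline, are not listed in this post. A minimal sketch of what pipelines.py could look like, assuming pymongo is installed and reusing the MONGO_* settings defined above (the fengguang.json output filename is made up for this example):

# Hypothetical sketch of tutorial/pipelines.py (not shown in the original post)
import json
import pymongo

# Reuse the MongoDB settings from settings.py, like the middlewares do
from .settings import MONGO_HOST, MONGO_PORT, MONGO_DB, MONGO_COLL


class BtKuaiMongo(object):
    def __init__(self):
        # Connect to the database/collection configured in settings.py
        client = pymongo.MongoClient(MONGO_HOST, MONGO_PORT)
        self.coll = client[MONGO_DB][MONGO_COLL]

    def process_item(self, item, spider):
        # One MongoDB document per scraped item
        self.coll.insert_one(dict(item))
        return item


class JsonWritePipline(object):
    def __init__(self):
        # Output filename is made up for this sketch
        self.file = open('fengguang.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # One JSON object per line
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        self.file.close()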