Decoxy

从头学习爬虫（十七）重构篇----WebMagic框架分析之spider

这系列文章主要分析分析webmagic框架，没有实战内容，如有实战问题可以讨论，也可以提供技术支持。

欢迎加群313557283（刚创建），小白互相学习~

Spider

package us.codecraft.webmagic;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.downloader.Downloader;
import us.codecraft.webmagic.downloader.HttpClientDownloader;
import us.codecraft.webmagic.pipeline.CollectorPipeline;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.pipeline.Pipeline;
import us.codecraft.webmagic.pipeline.ResultItemsCollectorPipeline;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.Scheduler;
import us.codecraft.webmagic.thread.CountableThreadPool;
import us.codecraft.webmagic.utils.UrlUtils;
import us.codecraft.webmagic.utils.WMCollections;

import java.io.Closeable;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Entrance of a crawler.

 * A spider contains four modules: Downloader, Scheduler, PageProcessor and
 * Pipeline.

 * Every module is a field of Spider. 

 * The modules are defined in interface. 

 * You can customize a spider with various implementations of them. 

 * Examples: 

 * 

 * A simple crawler: 

 * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
 * "http://my.oschina.net/*blog/*")).run();

 * 

 * Store results to files by FilePipeline: 

 * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
 * "http://my.oschina.net/*blog/*")) 

 * .pipeline(new FilePipeline("/data/temp/webmagic/")).run(); 

 * 

 * Use FileCacheQueueScheduler to store urls and cursor in files, so that a
 * Spider can resume the status when shutdown. 

 * Spider.create(new SimplePageProcessor("http://my.oschina.net/",
 * "http://my.oschina.net/*blog/*")) 

 * .scheduler(new FileCacheQueueScheduler("/data/temp/webmagic/cache/")).run(); 

 *
 * @author [email protected] 

 * @see Downloader
 * @see Scheduler
 * @see PageProcessor
 * @see Pipeline
 * @since 0.1.0
 */
//实现两个接口一个Runnable,一个Task
/**
 * 对于每个任务返回一个
 * @return uuid
 *//*
public String getUUID();
*//**
 * 返回site对象
 * @return site
 *//*
public Site getSite();*/

public class Spider implements Runnable, Task {
	
    //下载器
    protected Downloader downloader;
    //流管道，处理数据 
    protected List pipelines = new ArrayList();
    //处理模块,解析页面，抽取有用信息
    protected PageProcessor pageProcessor;
    //请求集合
    protected List startRequests;
    //site 放请求信息
    protected Site site;
    //uuid
    protected String uuid;
    //负责管理待抓取的URL，以及一些去重的工作
    protected Scheduler scheduler = new QueueScheduler();
    //日志
    protected Logger logger = LoggerFactory.getLogger(getClass());
    //自己的线程池
    protected CountableThreadPool threadPool;
    //Executor直接的扩展接口,也是最常用的线程池接口,我们通常见到的线程池定时任务线程池都是它的实现类
    protected ExecutorService executorService;
    //线程数
    protected int threadNum = 1;
    //初始化提供原子操作
    protected AtomicInteger stat = new AtomicInteger(STAT_INIT);
    //处理完全标志位
    protected boolean exitWhenComplete = true;
    //初始化
    protected final static int STAT_INIT = 0;
    //运行
    protected final static int STAT_RUNNING = 1;
    //结束
    protected final static int STAT_STOPPED = 2;
    //不清楚
    protected boolean spawnUrl = true;
    //销毁标志位
    protected boolean destroyWhenExit = true;
    //重入锁(ReentrantLock)是一种递归无阻塞的同步机制
    private ReentrantLock newUrlLock = new ReentrantLock();
    //条件
    private Condition newUrlCondition = newUrlLock.newCondition();
    //爬虫监听集合 存放请求是否成功
    private List spiderListeners;
    //初始化提供原子操作
    private final AtomicLong pageCount = new AtomicLong(0);
    //开始时间
    private Date startTime;
    //空闲休眠30秒
    private int emptySleepTime = 30000;

    //通过加入PageProcessor创建返回Spider
    /**
     * create a spider with pageProcessor.
     *
     * @param pageProcessor pageProcessor
     * @return new spider
     * @see PageProcessor
     */
    public static Spider create(PageProcessor pageProcessor) {
        return new Spider(pageProcessor);
    }
    
    //默认要初始化site
    /**
     * create a spider with pageProcessor.
     *
     * @param pageProcessor pageProcessor
     */
    public Spider(PageProcessor pageProcessor) {
        this.pageProcessor = pageProcessor;
        this.site = pageProcessor.getSite();
    }

    /**
     * Set startUrls of Spider.

     * Prior to startUrls of Site.
     *
     * @param startUrls startUrls
     * @return this
     */
    public Spider startUrls(List startUrls) {
    	//检查是否已经运行
        checkIfRunning();
        //将url 转换成request
        this.startRequests = UrlUtils.convertToRequests(startUrls);
        return this;
    }

    /**
     * Set startUrls of Spider.

     * Prior to startUrls of Site.
     *
     * @param startRequests startRequests
     * @return this
     */
    public Spider startRequest(List startRequests) {
    	//检查是否已经运行
    	checkIfRunning();
    	//如果是request 不转换
        this.startRequests = startRequests;
        return this;
    }

    /**
     * Set an uuid for spider.

     * Default uuid is domain of site.

     *
     * @param uuid uuid
     * @return this
     */
    public Spider setUUID(String uuid) {
    	//设置任务
        this.uuid = uuid;
        return this;
    }

    /**
     * set scheduler for Spider
     *
     * @param scheduler scheduler
     * @return this
     * @see #setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
     */
    @Deprecated
    public Spider scheduler(Scheduler scheduler) {
        return setScheduler(scheduler);
    }

    /**
     * set scheduler for Spider
     *
     * @param scheduler scheduler
     * @return this
     * @see Scheduler
     * @since 0.2.1
     */
    public Spider setScheduler(Scheduler scheduler) {
    	//检查是否运行
        checkIfRunning();
        Scheduler oldScheduler = this.scheduler;
        this.scheduler = scheduler;
        //可以理解 如果老的scheduler 还有请求没有去fetch 新的会把剩余加进去
        //老队列吐出一个请求 新队列加一个
        if (oldScheduler != null) {
            Request request;
            while ((request = oldScheduler.poll(this)) != null) {
                this.scheduler.push(request, this);
            }
        }
        return this;
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see #addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
     * @deprecated
     */
    public Spider pipeline(Pipeline pipeline) {
        return addPipeline(pipeline);
    }

    /**
     * add a pipeline for Spider
     *
     * @param pipeline pipeline
     * @return this
     * @see Pipeline
     * @since 0.2.1
     */
    public Spider addPipeline(Pipeline pipeline) {
        checkIfRunning();
        this.pipelines.add(pipeline);
        return this;
    }

    /**
     * set pipelines for Spider
     *
     * @param pipelines pipelines
     * @return this
     * @see Pipeline
     * @since 0.4.1
     */
    public Spider setPipelines(List pipelines) {
        checkIfRunning();
        //加入批量数据持久化
        this.pipelines = pipelines;
        return this;
    }

    /**
     * clear the pipelines set
     *
     * @return this
     */
    public Spider clearPipeline() {
        pipelines = new ArrayList();
        return this;
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see #setDownloader(us.codecraft.webmagic.downloader.Downloader)
     * @deprecated
     */
    public Spider downloader(Downloader downloader) {
        return setDownloader(downloader);
    }

    /**
     * set the downloader of spider
     *
     * @param downloader downloader
     * @return this
     * @see Downloader
     */
    public Spider setDownloader(Downloader downloader) {
        checkIfRunning();
        this.downloader = downloader;
        return this;
    }
    
    //初始化
    protected void initComponent() {
        if (downloader == null) {
            this.downloader = new HttpClientDownloader();
        }
        if (pipelines.isEmpty()) {
            pipelines.add(new ConsolePipeline());
        }
        //默认一个线程
        downloader.setThread(threadNum);
        //生成线程池
        if (threadPool == null || threadPool.isShutdown()) {
            if (executorService != null && !executorService.isShutdown()) {
                threadPool = new CountableThreadPool(threadNum, executorService);
            } else {
                threadPool = new CountableThreadPool(threadNum);
            }
        }
        //批量加入初始链接
        if (startRequests != null) {
            for (Request request : startRequests) {
                addRequest(request);
            }
            startRequests.clear();
        }
        //初始化开始时间
        startTime = new Date();
    }
    
    //同步
    @Override
    public void run() {
    	//检查状态
        checkRunningStat();
        //初始化
        initComponent();
        logger.info("Spider {} started!",getUUID());
        //已经运行并且测试线程 Thread 对象 是否已经是中断状态，但不清楚状态标志。
        while (!Thread.currentThread().isInterrupted() && stat.get() == STAT_RUNNING) {
            //默认是初始化队列scheduler  把任务放进去
        	final Request request = scheduler.poll(this);
            if (request == null) {
                if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
                    break;
                }
                // wait until new url added
                waitNewUrl();
            } else {
            	//异步
                threadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                        	//处理
                            processRequest(request);
                            //计数 多少页下载一般最后下完完毕都可以看到
                            onSuccess(request);
                        } catch (Exception e) {
                            onError(request);
                            logger.error("process request " + request + " error", e);
                        } finally {
                        	//总计
                            pageCount.incrementAndGet();
                            //等待新的url
                            signalNewUrl();
                        }
                    }
                });
            }
        }
        //完成就设置状态over
        stat.set(STAT_STOPPED);
        // release some resources
        if (destroyWhenExit) {
            close();
        }
        logger.info("Spider {} closed! {} pages downloaded.", getUUID(), pageCount.get());
    }
    
    //失败计数
    protected void onError(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onError(request);
            }
        }
    }

   //成功计数
    protected void onSuccess(Request request) {
        if (CollectionUtils.isNotEmpty(spiderListeners)) {
            for (SpiderListener spiderListener : spiderListeners) {
                spiderListener.onSuccess(request);
            }
        }
    }
    
    //检查初始化状态
    private void checkRunningStat() {
        while (true) {
            int statNow = stat.get();
            if (statNow == STAT_RUNNING) {
                throw new IllegalStateException("Spider is already running!");
            }
           // CAS有3个操作数，内存值V，旧的预期值A，要修改的新值B。当且仅当预期值A和内存值V相同时，将内存值V修改为B，否则什么都不做。
            if (stat.compareAndSet(statNow, STAT_RUNNING)) {
                break;
            }
        }
    }
    
    //停止方法
    public void close() {
        destroyEach(downloader);
        destroyEach(pageProcessor);
        destroyEach(scheduler);
        for (Pipeline pipeline : pipelines) {
            destroyEach(pipeline);
        }
        threadPool.shutdown();
    }
    
    //对象销毁
    private void destroyEach(Object object) {
        if (object instanceof Closeable) {
            try {
                ((Closeable) object).close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Process specific urls without url discovering.
     *
     * @param urls urls to process
     */
    public void test(String... urls) {
        initComponent();
        if (urls.length > 0) {
            for (String url : urls) {
                processRequest(new Request(url));
            }
        }
    }
    
    //处理请求
    private void processRequest(Request request) {
        Page page = downloader.download(request, this);
        if (page.isDownloadSuccess()){
            onDownloadSuccess(request, page);
        } else {
            onDownloaderFail(request);
        }
    }
    
    //下载成功
    private void onDownloadSuccess(Request request, Page page) {
    	//包含响应码
        if (site.getAcceptStatCode().contains(page.getStatusCode())){
            //处理
        	pageProcessor.process(page);
        	//不知道
            extractAndAddRequests(page, spawnUrl);
            if (!page.getResultItems().isSkip()) {
                for (Pipeline pipeline : pipelines) {
                	//多pipeline 处理数据
                    pipeline.process(page.getResultItems(), this);
                }
            }
        } else {
            logger.info("page status code error, page {} , code: {}", request.getUrl(), page.getStatusCode());
        }
        //site 设值 休眠
        sleep(site.getSleepTime());
        return;
    }
    
    //下载失败
    private void onDownloaderFail(Request request) {
    	//放弃
        if (site.getCycleRetryTimes() == 0) {
            sleep(site.getSleepTime());
        } else {
            // for cycle retry
        	//重试
            doCycleRetry(request);
        }
    }
    
    //重试
    private void doCycleRetry(Request request) {
    	//重试参数放在 request extra  key='_cycle_tried_times'
        Object cycleTriedTimesObject = request.getExtra(Request.CYCLE_TRIED_TIMES);
        if (cycleTriedTimesObject == null) {
            addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, 1));
        } else {
            int cycleTriedTimes = (Integer) cycleTriedTimesObject;
            cycleTriedTimes++;
            if (cycleTriedTimes < site.getCycleRetryTimes()) {
                addRequest(SerializationUtils.clone(request).setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, cycleTriedTimes));
            }
        }
        sleep(site.getRetrySleepTime());
    }

    protected void sleep(int time) {
        try {
            Thread.sleep(time);
        } catch (InterruptedException e) {
            logger.error("Thread interrupted when sleep",e);
        }
    }

    protected void extractAndAddRequests(Page page, boolean spawnUrl) {
        if (spawnUrl && CollectionUtils.isNotEmpty(page.getTargetRequests())) {
            for (Request request : page.getTargetRequests()) {
                addRequest(request);
            }
        }
    }
    
    //加入请求
    private void addRequest(Request request) {
    	//如果没有域名自动给你加域名
        if (site.getDomain() == null && request != null && request.getUrl() != null) {
            site.setDomain(UrlUtils.getDomain(request.getUrl()));
        }
        //加入scheduler 默认是队列
        scheduler.push(request, this);
    }
    
    //检查状态
    protected void checkIfRunning() {
        if (stat.get() == STAT_RUNNING) {
            throw new IllegalStateException("Spider is already running!");
        }
    }
    
    //异步启动
    public void runAsync() {
        Thread thread = new Thread(this);
        thread.setDaemon(false);
        thread.start();
    }

    /**
     * Add urls to crawl. 

     *
     * @param urls urls
     * @return this
     */
    //url 转换成request 
    //例如 addurl(url1).addurl(url2)
    public Spider addUrl(String... urls) {
        for (String url : urls) {
            addRequest(new Request(url));
        }
        signalNewUrl();
        return this;
    }

    /**
     * Download urls synchronizing.
     *
     * @param urls urls
     * @param  type of process result
     * @return list downloaded
     */
    public  List getAll(Collection urls) {
        destroyWhenExit = false;
        spawnUrl = false;
        if (startRequests!=null){
            startRequests.clear();
        }
        for (Request request : UrlUtils.convertToRequests(urls)) {
            addRequest(request);
        }
        CollectorPipeline collectorPipeline = getCollectorPipeline();
        pipelines.add(collectorPipeline);
        run();
        spawnUrl = true;
        destroyWhenExit = true;
        return collectorPipeline.getCollected();
    }

    protected CollectorPipeline getCollectorPipeline() {
        return new ResultItemsCollectorPipeline();
    }

    public  T get(String url) {
        List urls = WMCollections.newArrayList(url);
        List resultItemses = getAll(urls);
        if (resultItemses != null && resultItemses.size() > 0) {
            return resultItemses.get(0);
        } else {
            return null;
        }
    }

    /**
     * Add urls with information to crawl.

     *
     * @param requests requests
     * @return this
     */
    //批量加入
    public Spider addRequest(Request... requests) {
        for (Request request : requests) {
            addRequest(request);
        }
        signalNewUrl();
        return this;
    }
    
    //等待新的url
    private void waitNewUrl() {
        newUrlLock.lock();
        try {
            //double check
            if (threadPool.getThreadAlive() == 0 && exitWhenComplete) {
                return;
            }
            newUrlCondition.await(emptySleepTime, TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            logger.warn("waitNewUrl - interrupted, error {}", e);
        } finally {
            newUrlLock.unlock();
        }
    }
    
    private void signalNewUrl() {
        try {
            newUrlLock.lock();
            ///唤醒 await()等待队列中所有的线程
            newUrlCondition.signalAll();
        } finally {
            newUrlLock.unlock();
        }
    }
    
    //=runAsync（） 异步启动
    public void start() {
        runAsync();
    }
    
    //停止爬虫
    public void stop() {
    	//状态加1
        if (stat.compareAndSet(STAT_RUNNING, STAT_STOPPED)) {
            logger.info("Spider " + getUUID() + " stop success!");
        } else {
            logger.info("Spider " + getUUID() + " stop fail!");
        }
    }

    /**
     * start with more than one threads
     *
     * @param threadNum threadNum
     * @return this
     */
    //线程数
    public Spider thread(int threadNum) {
        checkIfRunning();
        this.threadNum = threadNum;
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        }
        return this;
    }

    /**
     * start with more than one threads
     *
     * @param executorService executorService to run the spider
     * @param threadNum threadNum
     * @return this
     */
    //重载方法 加入executorService 自定义线程池
    public Spider thread(ExecutorService executorService, int threadNum) {
        checkIfRunning();
        this.threadNum = threadNum;
        if (threadNum <= 0) {
            throw new IllegalArgumentException("threadNum should be more than one!");
        }
        this.executorService = executorService;
        return this;
    }
    
    //是否完全
    public boolean isExitWhenComplete() {
        return exitWhenComplete;
    }

    /**
     * Exit when complete. 

     * True: exit when all url of the site is downloaded. 

     * False: not exit until call stop() manually.

     *
     * @param exitWhenComplete exitWhenComplete
     * @return this
     */
    public Spider setExitWhenComplete(boolean exitWhenComplete) {
        this.exitWhenComplete = exitWhenComplete;
        return this;
    }

    public boolean isSpawnUrl() {
        return spawnUrl;
    }

    /**
     * Get page count downloaded by spider.
     *
     * @return total downloaded page count
     * @since 0.4.1
     */
    public long getPageCount() {
        return pageCount.get();
    }

    /**
     * Get running status by spider.
     *
     * @return running status
     * @see Status
     * @since 0.4.1
     */
    //爬虫状态获取
    public Status getStatus() {
        return Status.fromValue(stat.get());
    }

    //状态枚举内部类
    public enum Status {
        Init(0), Running(1), Stopped(2);

        private Status(int value) {
            this.value = value;
        }

        private int value;

        int getValue() {
            return value;
        }

        public static Status fromValue(int value) {
            for (Status status : Status.values()) {
                if (status.getValue() == value) {
                    return status;
                }
            }
            //default value
            return Init;
        }
    }

    /**
     * Get thread count which is running
     *
     * @return thread count which is running
     * @since 0.4.1
     */
    //获取存活线程数
    public int getThreadAlive() {
        if (threadPool == null) {
            return 0;
        }
        return threadPool.getThreadAlive();
    }

    /**
     * Whether add urls extracted to download.

     * Add urls to download when it is true, and just download seed urls when it is false. 

     * DO NOT set it unless you know what it means!
     *
     * @param spawnUrl spawnUrl
     * @return this
     * @since 0.4.0
     */
    public Spider setSpawnUrl(boolean spawnUrl) {
        this.spawnUrl = spawnUrl;
        return this;
    }

    @Override
    public String getUUID() {
        if (uuid != null) {
            return uuid;
        }
        if (site != null) {
            return site.getDomain();
        }
        uuid = UUID.randomUUID().toString();
        return uuid;
    }

    public Spider setExecutorService(ExecutorService executorService) {
        checkIfRunning();
        this.executorService = executorService;
        return this;
    }

    @Override
    public Site getSite() {
        return site;
    }

    public List getSpiderListeners() {
        return spiderListeners;
    }

    public Spider setSpiderListeners(List spiderListeners) {
        this.spiderListeners = spiderListeners;
        return this;
    }

    public Date getStartTime() {
        return startTime;
    }

    public Scheduler getScheduler() {
        return scheduler;
    }

    /**
     * Set wait time when no url is polled.


     *
     * @param emptySleepTime In MILLISECONDS.
     */
    public void setEmptySleepTime(int emptySleepTime) {
        this.emptySleepTime = emptySleepTime;
    }
}

python 爬取preview的信息 YHFJerry python 开发语言
Python,HTTP相关视频讲解：python的or运算赋值用法用python编程Excel有没有用处？011_编程到底好玩在哪？查看python文件_输出py文件_cat_运行python文件_shelPython爬取Preview的信息在当今互联网时代，信息的获取变得异常方便，爬虫技术成为了一种非常重要的手段。Python作为一门强大的编程语言，被广泛用于网络爬虫的开发。本文将介绍如何使用P
python爬虫从入门到精通大模型猫叔 python 爬虫数据库
目录一、正确认识Python爬虫二、了解爬虫的本质1.熟悉Python编程2.了解HTML3.了解网络爬虫的基本原理4.学习使用Python爬虫库三、了解非结构化数据的存储1.本地文件2.数据库四、掌握各种技巧，应对特殊网站的反爬措施1.User-Agent2.Cookies3.IP代理五、学习爬虫框架，搭建工程化的爬虫1.创建Scrapy项目2.创建Spider3.编写Spider4.运行Spi
python爬虫入门（小白五分钟从入门到精通）一百天成为python专家 python 爬虫开发语言网络爬虫 python3.11 ipython
网络爬虫的介绍本节主要介绍Pytbon语言中支持网络爬虫的库,此外还将介绍如何获取网站的爬取规则，读者在学习和践过程中一定要严格遵守网站提供的爬取规则。网络爬虫网络爬虫通俗来讲就是使用代码将HTML网页的内容下载到本地的过程。爬取网页主要是为了获取网中的关键信息，例如网页中的数据、图片、视频等。Python语言中提供了多个具有爬虫功能的库，下面将具urHIib库:是Python自带的标准库，无须下
网络爬虫——python爬取豆瓣评论 SSeaflower 爬虫 python 开发语言
网络爬虫——python爬取豆瓣评论一、网络爬虫概述1.1网络爬虫定义网络爬虫，又被称为网络蜘蛛（WebSpider）、网络机器人等。它根据网页地址（URL）爬取网页内容，网页地址（URL）就是我们在浏览器中输入的网站链接。例如：https://www.baidu.com；https://movie.douban.com/。网络爬虫不仅能够复制网页信息和下载音视频，还可以做到网站的模拟登录和行为链
标题 “Python 网络爬虫 —— selenium库驱动浏览器 WeiJingYu. python 爬虫 selenium
一、Selenium库核心认知Selenium库是Web应用程序测试与自动化操作的利器，能驱动浏览器（如Edge、Firefox等）执行点击、输入、打开、验证等操作。与Requests库差异显著：Requests库仅能获取网页原始代码，而Selenium基于浏览器驱动程序工作，浏览器可渲染网页源代码，借此能轻松拿到渲染后的数据信息（如JS动态加载内容），完美解决Requests库无法处理的动态页面
Python网络爬虫实现selenium对百度识图二次开发以及批量保存Excel WeiJingYu. python 爬虫 selenium
一.百度识图自动上传图片fromseleniumimportwebdriverfromselenium.webdriver.edge.optionsimportOptionsfromselenium.webdriver.common.byimportByedge_options=Options()edge_options.binary_location=r"C:\ProgramFiles(x86)
Python 网络爬虫 —— 代理服务器 WeiJingYu. 爬虫服务器前端
一、会话（Session）（一）核心逻辑HTTP本身无记忆，每次请求独立。会话（Session）就是为解决这问题，让客户端（浏览器）和服务器“记住”交互状态（比如登录态），常用Cookie实现：服务器发Cookie给客户端存着，下次请求带着，服务器就知道“是同一用户”。（二）创建会话（requests实现）用requests库的Session类，自动维持会话、管理Cookie，代码形式：impor
网络爬虫-07 YEGE学AI算法 Python-网络爬虫
网络爬虫-07）**Spider06回顾****scrapy框架****完成scrapy项目完整流程****我们必须记住****爬虫项目启动方式****数据持久化存储****Spider07笔记****分布式爬虫****scrapy_redis详解****腾讯招聘分布式改写****机器视觉与tesseract****补充-滑块缺口验证码案例****豆瓣网登录****Fiddler抓包工具****移
python大数据论文_大数据环境下基于python的网络爬虫技术 weixin_39775976 python大数据论文
软件开发大数据环境下基于python的网络爬虫技术作者/谢克武，重庆工商大学派斯学院软件工程学院摘要：随着互联网的发展壮大，网络数据呈爆炸式增长，传统捜索引擎已经不能满足人们对所需求数据的获取的需求，作为搜索引擎的抓取数据的重要组成部分，网络爬虫的作用十分重要，本文首先介绍了在大数据环境下网络爬虫的重要性，接着介绍了网络爬虫的概念，工作原理，工作流程，网页爬行策略，python在编写爬虫领域的优势
网络爬虫：技术原理、应用场景与合法使用全攻略程序小武 python爬虫入门爬虫网络
爬虫是什么？网络爬虫（WebScraping或WebCrawling）是一种通过自动化方式从网站上抓取公开数据的程序。它通过模拟用户在浏览器中浏览网页的过程，访问网页、提取信息，并将数据保存到本地系统中。爬虫技术广泛应用于搜索引擎、数据收集、市场分析、信息聚合等多个领域。爬虫能做什么？数据收集爬虫可以高效地从互联网上的大量网站收集信息。比如，抓取新闻网站上的文章内容、商品电商平台的价格与库存数据、
python网络爬虫(第一章/共三章：网络爬虫库、robots.txt规则（防止犯法）、查看获取网页源代码)
python网络爬虫(第一章/共三章：网络爬虫库、robots.txt规则（防止犯法）、查看获取网页源代码)学习python网络爬虫的完整路径：（第一章即此篇文章）（第二章）python网络爬虫(第二章/共三章：安装浏览器驱动，驱动浏览器加载网页、批量下载资源)-CSDN博客https://blog.csdn.net/2302_78022640/article/details/149431071?
Python爬虫实战：使用最新技术爬取新华网新闻数据 Python爬虫项目 2025年爬虫实战项目 python 爬虫开发语言 scrapy 音视频
一、前言在当今信息爆炸的时代，网络爬虫技术已经成为获取互联网数据的重要手段。作为国内权威新闻媒体，新华网每天发布大量高质量的新闻内容，这些数据对于舆情分析、市场研究、自然语言处理等领域具有重要价值。本文将详细介绍如何使用Python最新技术构建一个高效、稳定的新华网新闻爬虫系统。二、爬虫技术选型2.1技术栈选择在构建新华网爬虫时，我们选择了以下技术栈：请求库：httpx（支持HTTP/2，异步请求
python 计算生态概览的概述
文章目录前言python计算生态库的介绍1.网络爬虫2.数据分析3.文本处理4.数据可视化5.机器学习6.图形用户界面7.游戏开发8.网络应用开发前言python计算生态概览的解释Python计算生态概览是对Python作为一门强大而广泛使用的编程语言所拥有的庞大软件集合的整体描述和概述。这个生态体系不仅包含了Python的标准库（stdlib），即随Python解释器安装的基本模块，还涵盖了极其
Python生态全景图：8大主流框架优缺点及选型指南 Sammyyyyy python 开发语言 django fastapi flask
引言：Python的“万能”生态Python为何能成为当今最流行的编程语言之一？答案并非其语法本身，而在于其强大且多样化的框架生态。这个生态系统如同一片繁荣的大陆，覆盖了从Web后端到人工智能的几乎所有技术领域，让开发者能用一种语言胜任多种截然不同的任务。本文将化作一张“技术地图”，快速带你游览Python在Web开发、数据科学和网络爬虫三大领域的8个标志性框架。我们的目标是迅速掌握它们的精髓，让
Python 网络爬虫中 robots 协议使用的常见问题及解决方法
在Python网络爬虫开发中，robots协议的正确应用是保证爬虫合规性的关键。然而，在实际使用过程中，开发者常会遇到各种问题，若处理不当，可能导致爬虫被封禁或引发法律风险。本文将梳理robots协议使用中的常见问题，并提供针对性的解决方法。一、协议解析不准确导致的合规性问题1.1误读User-agent通配符范围问题表现：将User-agent:*错误理解为适用于所有场景，忽略了特定爬虫的单独规
Julia爬取数据能力及应用场景 q56731523 julia 开发语言
Julia是一种高性能编程语言，特别适合数值计算和数据分析。然而，关于数据爬取（即网络爬虫）方面，我们需要明确以下几点：虽然它是一门通用编程语言，但它的强项不在于网络爬取（WebScraping）这类任务。而且Julia的生态系统在爬虫方面还不够成熟和丰富。所以说Julia爬取数据后立即进行高性能的数据分析这点还是有一些优势。Julia虽然以高性能数值计算和数据分析见长，但它同样具备网络爬取（We
Python 网络爬虫的基本流程及 robots 协议详解女码农的重启 python 网络爬虫 JAVA 开发语言
数据驱动的时代，网络爬虫作为高效获取互联网信息的工具，其规范化开发离不开对基本流程的掌握和对robots协议的遵守。本文将系统梳理Python网络爬虫的核心流程，并深入解读robots协议的重要性及实践规范。一、Python网络爬虫的基本流程Python网络爬虫的工作过程可分为四个核心阶段，每个阶段环环相扣，共同构成数据采集的完整链路。1.1发起网络请求这是爬虫与目标服务器交互的第一步，通过发送H
Python爬虫实战：使用最新技术爬取头条新闻数据 Python爬虫项目 2025年爬虫实战项目 python 爬虫开发语言 scrapy 音视频
一、前言：Python爬虫在现代数据获取中的重要性在当今信息爆炸的时代，数据已经成为最宝贵的资源之一。作为数据获取的重要手段，网络爬虫技术在各个领域发挥着越来越重要的作用。Python凭借其简洁的语法、丰富的库生态系统和强大的社区支持，已经成为网络爬虫开发的首选语言。本文将详细介绍如何使用Python及其最新的爬虫技术来爬取头条新闻数据。我们将从基础概念讲起，逐步深入到高级技巧，最后给出完整的爬虫
Vlang编写爬虫可行性分析
最近有人问V(Vlang)语言可以用来做数据采集么，那么我在这里明确告诉你，V(Vlang)完全可以用来编写网络爬虫。虽然它主打的是系统编程语言，但其设计目标包括简洁、高效和实用性，这使得它在处理像爬虫这样的网络任务时也表现出色。V的并发模型适合高并发爬虫，但实际效果待测试。最后给出一个简单例子展示基础流程，同时指出生态限制，避免用户期望过高。个人建议如果项目复杂，可能选Python更省力，毕竟p
Python爬企查查网站数据的爬虫代码如何写？ cda2024 python 爬虫开发语言
在大数据时代，数据的获取与分析变得尤为重要。企业信息查询平台“企查查”作为国内领先的企业信用信息查询工具，提供了丰富的企业数据资源。对于数据科学家和工程师而言，能够从这些平台高效地抓取数据，无疑是一项重要的技能。本文将详细介绍如何使用Python编写爬虫代码，从企查查网站抓取企业数据，并探讨其中的技术难点和解决方案。为什么选择Python？Python是一门广泛应用于数据科学和网络爬虫开发的语言，
Python爬虫实战：研究HTTP Agent Parser 库相关技术 ylfhpy 爬虫项目实战 python 爬虫 http
1.引言1.1研究背景与意义在当今数字化时代，网络数据作为一种重要的信息资源，在商业决策、学术研究、社会分析等领域发挥着越来越重要的作用。网络爬虫作为一种自动获取网页内容的技术，成为了获取这些数据的重要工具。然而，随着网络爬虫的广泛使用，网站也采取了各种反爬机制来保护自身数据和服务安全。其中，用户代理（User-Agent）检测是一种常见的反爬手段。网站通过分析请求的User-Agent信息，识别
Python爬虫实战：研究pyparsing工具相关技术 ylfhpy 爬虫项目实战 python 爬虫开发语言 pyparsing 文本处理文本分析
1.引言在当今信息爆炸的时代，网络上存在着海量的非结构化文本数据。如何从这些数据中提取有价值的信息，成为了数据科学领域的一个重要研究方向。网络爬虫技术可以帮助我们自动获取这些数据，而Pyparsing则提供了强大的语法分析能力，可以将非结构化的文本转换为结构化的信息。本文将介绍一个完整的案例，展示如何使用Python的爬虫技术结合Pyparsing工具，构建一个网络内容分析系统。该系统可以爬取特定
156个Python网络爬虫资源，妈妈再也不用担心你找不到资源！_爬虫 csdn资源
本列表包含Python网页抓取和数据处理相关的库。网络相关通用urllib-网络库(标准库)requests-网络库grab-网络库(基于pycurl)pycurl-网络库(与libcurl绑定)urllib3-具有线程安全连接池、文件psot支持、高可用的PythonHTTP库httplib2-网络库RoboBrowser-一个无需独立浏览器即可访问网页的简单、pythonic的库Mechani
Vlang编写轻量化多线程爬虫 q56731523 爬虫 typescript 开发语言前端
Vlang作为新兴语言，他简单、快速和安全让爬虫有不一样的体验。在V中，并发模型基于轻量级的协程（称为goroutines，类似于Go语言的goroutine）和通道（channels）来实现。虽然说V语言目前还在快速发展中，但它的并发特性已经可以用于构建多线程（实际上是协程）应用程序，例如网络爬虫。所以说，用V语言（Vlang）完全支持编写多线程爬虫。它提供了强大的并发模型和网络库，非常适合高效
python笔记-Selenium谷歌浏览器驱动下载 hero.zhong python 笔记 selenium
Selenium谷歌浏览器驱动下载地址：https://googlechromelabs.github.io/chrome-for-testing/#stable下面是遇到的问题：python网络爬虫技术中使用谷歌浏览器代码，报错：OSError:[WinError193]%1不是有效的Win32应用程序：遇到错误OSError:[WinError193]%1不是有效的Win32应用程序通常意味着
爬虫的笔记整理咸鱼时日翻身爬虫笔记
网络爬虫首先要认识http和https协议在浏览器中发送一个http请求：1.输入一个URL地址之后，向http服务器发送请求，主要分为GET和POST两种方法2.输入URL之后，发送一个request请求，这时候服务器把response文件对象发送回浏览器3.浏览器中解析返回的HTML，其中引用了许多的其他文件，images，css文件，JS文件等，再次法中request去获取这些内容4.所有的
Java简易爬虫：抓取京东图书信息实战指南黃昱儒
本文还有配套的精品资源，点击获取简介：本项目展示如何使用Java语言创建一个网络爬虫来抓取京东网站的图书信息。介绍使用Maven作为构建工具，HTTP客户端库发送请求，以及Jsoup或类似库解析HTML内容。讲解如何处理JavaScript动态加载内容，绕过反爬机制，并讨论数据存储和用户界面设计的策略。1.Java网络爬虫项目概述网络爬虫是一种自动获取网页内容的程序，它按照一定的规则，自动抓取互联
Java爬虫技术详解：原理、实现与优势 cyc&阿灿 Java 多线程 java 爬虫开发语言
一、什么是网络爬虫？网络爬虫（WebCrawler），又称网络蜘蛛或网络机器人，是一种自动化程序，能够按照一定的规则自动浏览和抓取互联网上的信息。爬虫技术是大数据时代获取网络数据的重要手段，广泛应用于搜索引擎、数据分析、价格监控等领域。Java作为一种稳定、高效的编程语言，凭借其强大的网络编程能力和丰富的生态库，成为开发网络爬虫的热门选择。二、Java爬虫核心组件一个完整的Java爬虫通常包含以下
Python网络爬虫与数据处理工具大全：从入门到精通俞凯润
Python网络爬虫与数据处理工具大全：从入门到精通awesome-web-scrapingListoflibraries,toolsandAPIsforwebscrapinganddataprocessing.项目地址:https://gitcode.com/gh_mirrors/aw/awesome-web-scraping本文基于知名Python网络爬虫资源库lorien/awesome-w
【Python入门】极速爬取：用Python Autoscraper库简化网络数据抓取
️极速爬取：用PythonAutoscraper库简化网络数据抓取你是否梦想过能够以闪电般的速度从网上抓取数据，而无需深入了解复杂的爬虫技术？️是否想要一个简单易用的工具，让你快速上手网络爬虫，而不必担心代码的繁琐？如果你的答案是肯定的，那么这篇文章将是你的加速器。让我们一起探索Python的Autoscraper库，学习如何用它来简化网络数据抓取的过程。引言在信息爆炸的时代，能够快速从互联网上抓
apache ftpserver-CentOS config gengzg apache
<server xmlns="http://mina.apache.org/ftpserver/spring/v1" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation=" http://mina.apache.o
优化MySQL数据库性能的八种方法 AILIKES sql mysql
1、选取最适用的字段属性　　MySQL可以很好的支持大数据量的存取，但是一般说来，数据库中的表越小，在它上面执行的查询也就会越快。因此，在创建表的时候，为了获得更好的性能，我们可以将表中字段的宽度设得尽可能小。例如，在定义邮政编码这个字段时，如果将其设置为CHAR(255),显然给数据库增加了不必要的空间，甚至使用VARCHAR这种类型也是多余的，因为CHAR(6)就可以很
JeeSite 企业信息化快速开发平台 Kai_Ge JeeSite
JeeSite 企业信息化快速开发平台平台简介 JeeSite是基于多个优秀的开源项目，高度整合封装而成的高效，高性能，强安全性的开源Java EE快速开发平台。 JeeSite本身是以Spring Framework为核心容器，Spring MVC为模型视图控制器，MyBatis为数据访问层， Apache Shiro为权限授权层，Ehcahe对常用数据进行缓存，Activit为工作流
通过Spring Mail Api发送邮件 120153216 邮件 main
原文地址：http://www.open-open.com/lib/view/open1346857871615.html 使用Java Mail API来发送邮件也很容易实现，但是最近公司一个同事封装的邮件API实在让我无法接受，于是便打算改用Spring Mail API来发送邮件，顺便记录下这篇文章。【Spring Mail API】 Spring Mail API都在org.spri
Pysvn 程序员使用指南 2002wmj SVN
源文件:http://ju.outofmemory.cn/entry/35762 这是一篇关于pysvn模块的指南. 完整和详细的API请参考 http://pysvn.tigris.org/docs/pysvn_prog_ref.html. pysvn是操作Subversion版本控制的Python接口模块. 这个API接口可以管理一个工作副本, 查询档案库, 和同步两个. 该
在SQLSERVER中查找被阻塞和正在被阻塞的SQL 357029540 SQL Server
SELECT R.session_id AS BlockedSessionID , S.session_id AS BlockingSessionID , Q1.text AS Block
Intent 常用的用法备忘 7454103 .net android Google Blog F#
Intent 应该算是Android中特有的东西。你可以在Intent中指定程序要执行的动作（比如：view,edit,dial），以及程序执行到该动作时所需要的资料。都指定好后，只要调用startActivity()，Android系统会自动寻找最符合你指定要求的应用程序，并执行该程序。下面列出几种Intent 的用法显示网页:
Spring定时器时间配置 adminjun spring 时间配置定时器
红圈中的值由6个数字组成，中间用空格分隔。第一个数字表示定时任务执行时间的秒，第二个数字表示分钟，第三个数字表示小时，后面三个数字表示日，月，年，< xmlnamespace prefix ="o" ns ="urn:schemas-microsoft-com:office:office" /> 测试的时候，由于是每天定时执行，所以后面三个数
POJ 2421 Constructing Roads 最小生成树 aijuans 最小生成树
来源：http://poj.org/problem?id=2421 题意：还是给你n个点，然后求最小生成树。特殊之处在于有一些点之间已经连上了边。思路：对于已经有边的点，特殊标记一下，加边的时候把这些边的权值赋值为0即可。这样就可以既保证这些边一定存在，又保证了所求的结果正确。代码： #include <iostream> #include <cstdio>
重构笔记——提取方法（Extract Method） ayaoxinchao java 重构提炼函数局部变量提取方法
提取方法（Extract Method）是最常用的重构手法之一。当看到一个方法过长或者方法很难让人理解其意图的时候，这时候就可以用提取方法这种重构手法。下面是我学习这个重构手法的笔记：提取方法看起来好像仅仅是将被提取方法中的一段代码，放到目标方法中。其实，当方法足够复杂的时候，提取方法也会变得复杂。当然，如果提取方法这种重构手法无法进行时，就可能需要选择其他
为UILabel添加点击事件 bewithme UILabel
默认情况下UILabel是不支持点击事件的，网上查了查居然没有一个是完整的答案，现在我提供一个完整的代码。 UILabel *l = [[UILabel alloc] initWithFrame:CGRectMake(60, 0, listV.frame.size.width - 60, listV.frame.size.height)]
NoSQL数据库之Redis数据库管理(PHP-REDIS实例) bijian1013 redis 数据库 NoSQL
一.redis.php <?php //实例化 $redis = new Redis(); //连接服务器 $redis->connect("localhost"); //授权 $redis->auth("lamplijie"); //相关操
SecureCRT使用备注 bingyingao secureCRT 每页行数
SecureCRT日志和卷屏行数设置一、使用securecrt时，设置自动日志记录功能。 1、在C:\Program Files\SecureCRT\下新建一个文件夹(也就是你的CRT可执行文件的路径），命名为Logs； 2、点击Options -> Global Options -> Default Session -> Edite Default Sett
【Scala九】Scala核心三：泛型 bit1129 scala
泛型类 package spark.examples.scala.generics class GenericClass[K, V](val k: K, val v: V) { def print() { println(k + "," + v) } } object GenericClass { def main(args: Arr
素数与音乐 bookjovi 素数数学 haskell
由于一直在看haskell，不可避免的接触到了很多数学知识，其中数论最多，如素数，斐波那契数列等，很多在学生时代无法理解的数学现在似乎也能领悟到那么一点。闲暇之余，从图书馆找了<<The music of primes>>和<<世界数学通史>>读了几遍。其中素数的音乐这本书与软件界熟知的&l
Java-Collections Framework学习与总结-IdentityHashMap BrokenDreams Collections
这篇总结一下java.util.IdentityHashMap。从类名上可以猜到，这个类本质应该还是一个散列表，只是前面有Identity修饰，是一种特殊的HashMap。简单的说，IdentityHashMap和HashM
读《研磨设计模式》-代码笔记-享元模式-Flyweight bylijinnan java 设计模式
声明：本文只为方便我个人查阅和理解，详细的分析以及源代码请移步原作者的博客http://chjavach.iteye.com/ import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java
PS人像润饰&调色教程集锦 cherishLC PS
1、仿制图章沿轮廓润饰——柔化图像，凸显轮廓 http://www.howzhi.com/course/retouching/ 新建一个透明图层，使用仿制图章不断Alt+鼠标左键选点，设置透明度为21%，大小为修饰区域的1/3左右（比如胳膊宽度的1/3），再沿纹理方向（比如胳膊方向）进行修饰。所有修饰完成后，对该润饰图层添加噪声，噪声大小应该和
更新多个字段的UPDATE语句 crabdave update
更新多个字段的UPDATE语句 update tableA a set (a.v1, a.v2, a.v3, a.v4) = --使用括号确定更新的字段范围
hive实例讲解实现in和not in子句 daizj hive not in in
本文转自：http://www.cnblogs.com/ggjucheng/archive/2013/01/03/2842855.html 当前hive不支持 in或not in 中包含查询子句的语法，所以只能通过left join实现。假设有一个登陆表login(当天登陆记录,只有一个uid),和一个用户注册表regusers(当天注册用户，字段只有一个uid)，这两个表都包含
一道24点的10+种非人类解法（2,3,10,10） dsjt 算法
这是人类算24点的方法？！！！事件缘由：今天晚上突然看到一条24点状态，当时惊为天人，这NM叫人啊？以下是那条状态朱明西 : 24点，算2 3 10 10，我LX炮狗等面对四张牌痛不欲生，结果跑跑同学扫了一眼说，算出来了，2的10次方减10的3次方。。我草这是人类的算24点啊。。然后么。。。我就在深夜很得瑟的问室友求室友算刚出完题，文哥的暴走之旅开始了 5秒后
关于YII的菜单插件 CMenu和面包末breadcrumbs路径管理插件的一些使用问题 dcj3sjt126com yii framework
在使用 YIi的路径管理工具时，发现了一个问题。 <?php
对象与关系之间的矛盾：“阻抗失配”效应[转] come_for_dream 对象
概述 “阻抗失配”这一词组通常用来描述面向对象应用向传统的关系数据库（RDBMS）存放数据时所遇到的数据表述不一致问题。C++程序员已经被这个问题困扰了好多年，而现在的Java程序员和其它面向对象开发人员也对这个问题深感头痛。 “阻抗失配”产生的原因是因为对象模型与关系模型之间缺乏固有的亲合力。“阻抗失配”所带来的问题包括：类的层次关系必须绑定为关系模式（将对象
学习编程那点事 gcq511120594 编程互联网
一年前的夏天，我还在纠结要不要改行，要不要去学php？能学到真本事吗？改行能成功吗？太多的问题，我终于不顾一切，下定决心，辞去了工作，来到传说中的帝都。老师给的乘车方式还算有效，很顺利的就到了学校，赶巧了，正好学校搬到了新校区。先安顿了下来，过了个轻松的周末，第一次到帝都，逛逛吧！接下来的周一，是我噩梦的开始，学习内容对我这个零基础的人来说，除了勉强完成老师布置的作业外，我已经没有时间和精力去
Reverse Linked List II hcx2013 list
Reverse a linked list from position m to n. Do it in-place and in one-pass. For example:Given 1->2->3->4->5->NULL, m = 2 and n = 4, return
Spring4.1新特性——页面自动化测试框架Spring MVC Test HtmlUnit简介 jinnianshilongnian spring 4.1
目录 Spring4.1新特性——综述 Spring4.1新特性——Spring核心部分及其他 Spring4.1新特性——Spring缓存框架增强 Spring4.1新特性——异步调用和事件机制的异常处理 Spring4.1新特性——数据库集成测试脚本初始化 Spring4.1新特性——Spring MVC增强 Spring4.1新特性——页面自动化测试框架Spring MVC T
Hadoop集群工具distcp liyonghui160com
1. 环境描述两个集群：rock 和 stone rock无kerberos权限认证，stone有要求认证。 1. 从rock复制到stone，采用hdfs Hadoop distcp -i hdfs://rock-nn:8020/user/cxz/input hdfs://stone-nn:8020/user/cxz/运行在rock端，即源端问题：报版本
一个备份MySQL数据库的简单Shell脚本 pda158 mysql 脚本
　　主脚本（用于备份mysql数据库）：　　该Shell脚本可以自动备份数据库。只要复制粘贴本脚本到文本编辑器中，输入数据库用户名、密码以及数据库名即可。我备份数据库使用的是mysqlump 命令。后面会对每行脚本命令进行说明。　　 1. 分别建立目录“backup”和“oldbackup” 　　#mkdir /backup 　　#mkdir /oldbackup 　
300个涵盖IT各方面的免费资源（中）——设计与编码篇 shoothao IT资源图标库图片库色彩板字体
A. 免费的设计资源 Freebbble:来自于Dribbble的免费的高质量作品。 Dribbble:Dribbble上“免费”的搜索结果——这是巨大的宝藏。 Graphic Burger:每个像素点都做得很细的绝佳的设计资源。 Pixel Buddha:免费和优质资源的专业社区。 Premium Pixels:为那些有创意的人提供免费的素材。
thrift总结 - 跨语言服务开发 uule thrift
官网官网JAVA例子 thrift入门介绍 IBM-Apache Thrift - 可伸缩的跨语言服务开发框架 Thrift入门及Java实例演示 thrift的使用介绍 RPC POM： <dependency> <groupId>org.apache.thrift</groupId>

从头学习爬虫（十七）重构篇----WebMagic框架分析之spider

Spider

你可能感兴趣的:(网络爬虫)