爬虫百度图片并下载的程序——C#程序

爬虫百度图片并下载的程序——C#程序

1、首先po一下主界面——经过作者美化(残害)之后的界面

爬虫百度图片并下载的程序——C#程序_第1张图片主要控件及功能介绍:

  1. 左上角的一个groupBox控件,text为基本设置,里面的控件包括3个label,2个textbox,2个button和1个domainUpDown控件,布局如上,主要设置程序搜索的关键词、爬取的页数以及爬取的文件/图片存储的路径。
  2. 右上角的一个groupBox控件,text为筛选设置,里面的控件包括5个label控件,3个combobox,2个domainUpDown和1个checkbox控件,布局如上,主要设置搜索中的筛选条件:图片主色调,图片类型和图片尺寸(默认是全部色调,全部类型和全部尺寸),当选中checkbox控件(自定义大小)时,开启宽和高的两个domainUpDown控件,默认这两个控件的Enabled属性是false。
  3. 其中筛选设置中的下拉框如下图所示:与百度搜索中的高级设置的筛选条件一致。

爬虫百度图片并下载的程序——C#程序_第2张图片

爬虫百度图片并下载的程序——C#程序_第3张图片

       4、点击开始下载就将图片存储在指定的文件夹中,每成功爬取一张图片就会在左下角的richtextbox中显示一条“图片下载成功!”,并且在右下角的richtextbox中显示网页源代码,下载完成后提示“爬虫结束!”。

爬虫百度图片并下载的程序——C#程序_第4张图片

爬虫百度图片并下载的程序——C#程序_第5张图片 

2、代码模块介绍

  1. 添加程序之后设计一个图片下载类如下:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;

namespace csharpspider
{
    public class download
    {
        /// <summary>
        /// Issues an HTTP GET for <paramref name="URL"/> and returns the response body stream.
        /// </summary>
        /// <param name="URL">Absolute URL to fetch.</param>
        /// <returns>
        /// The response stream when the server answers 200 OK; otherwise <c>null</c>.
        /// The caller owns the returned stream and must dispose it, which also
        /// releases the underlying response/connection.
        /// </returns>
        /// <exception cref="WebException">The request failed or timed out (30 s).</exception>
        public static Stream DownloadFile(String URL)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
            request.KeepAlive = false;
            request.Timeout = 30 * 1000;
            // The header values below were copied from a browser session (F12 developer tools).
            request.Method = "GET";
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
            request.Referer = "https://baike.baidu.com/item/vs/14494077?fr=aladdin";
            request.UserAgent = GetUA();

            HttpWebResponse response = (HttpWebResponse)request.GetResponse();
            if (response.StatusCode != HttpStatusCode.OK)
            {
                // Nobody will read this response, so close it here; otherwise its
                // connection stays checked out until the garbage collector runs.
                response.Close();
                return null;
            }
            return response.GetResponseStream();
        }


        /// <summary>
        /// Downloads the resource at <paramref name="URL"/> and writes it to <paramref name="filename"/>.
        /// </summary>
        /// <param name="URL">Absolute URL of the file to download.</param>
        /// <param name="filename">Destination path; an existing file is overwritten.</param>
        /// <param name="referer">Value sent in the Referer request header.</param>
        /// <returns><c>true</c> when the server answered 200 OK and the body was saved; <c>false</c> for any other status.</returns>
        /// <exception cref="WebException">The request failed or timed out (30 s).</exception>
        /// <exception cref="IOException">The destination file could not be written.</exception>
        public static  Boolean DownloadFile(string  URL,string  filename,string  referer)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(URL);
            request.KeepAlive = false;
            request.Timeout = 30 * 1000;
            request.Method = "GET";
            request.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3";
            // Take the host from the parsed URI instead of splitting the raw string,
            // which threw IndexOutOfRangeException on URLs with fewer than two slashes.
            request.Host = request.RequestUri.Authority;
            request.Referer = referer;
            request.UserAgent = GetUA();

            // Dispose the response on every path so the connection is released promptly.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    return false;
                }

                try
                {
                    using (Stream body = response.GetResponseStream())
                    using (FileStream fs = new FileStream(filename, FileMode.Create))
                    {
                        body.CopyTo(fs);
                    }
                }
                catch
                {
                    // A half-written file would later be mistaken for a finished
                    // download (callers skip files that already exist), so remove it.
                    try
                    {
                        File.Delete(filename);
                    }
                    catch (Exception)
                    {
                        // Best-effort cleanup only; the original failure is rethrown below.
                    }
                    throw;
                }
                return true;
            }
        }

        //这个getUA的列表可以在网上找到,我这里直接给出了我用的一个
        // Pool of browser User-Agent strings; built once instead of on every call.
        private static readonly string[] s_userAgents =
        {
  "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
  "Mozilla/5.0 (Windows; U; Windows NT 5.2; en-US) AppleWebKit/532.9 (KHTML, like Gecko) Chrome/5.0.310.0 Safari/532.9",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.7 (KHTML, like Gecko) Chrome/7.0.514.0 Safari/534.7",
  "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/9.0.601.0 Safari/534.14",
  "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.14 (KHTML, like Gecko) Chrome/10.0.601.0 Safari/534.14",
  "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.20 (KHTML, like Gecko) Chrome/11.0.672.2 Safari/534.20",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.27 (KHTML, like Gecko) Chrome/12.0.712.0 Safari/534.27",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.24 Safari/535.1",
  "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/535.2 (KHTML, like Gecko) Chrome/15.0.874.120 Safari/535.2",
  "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7",
  "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.10) Gecko/2009042316 Firefox/3.0.10",
  "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-GB; rv:1.9.0.11) Gecko/2009060215 Firefox/3.0.11 (.NET CLR 3.5.30729)",
  "Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 GTB5",
  "Mozilla/5.0 (Windows; U; Windows NT 5.1; tr; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0E)",
  "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
  "Mozilla/5.0 (Windows NT 5.1; rv:5.0) Gecko/20100101 Firefox/5.0",
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110622 Firefox/6.0a2",
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:7.0.1) Gecko/20100101 Firefox/7.0.1",
  "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:2.0b4pre) Gecko/20100815 Minefield/4.0b4pre",
  "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0 )",
  "Mozilla/4.0 (compatible; MSIE 5.5; Windows 98; Win 9x 4.90)",
  "Mozilla/5.0 (Windows; U; Windows XP) Gecko MultiZilla/1.6.1.0a",
  "Mozilla/2.02E (Win95; U)",
  "Mozilla/3.01Gold (Win95; I)",
  "Mozilla/4.8 [en] (Windows NT 5.1; U)",
  "Mozilla/5.0 (Windows; U; Win98; en-US; rv:1.4) Gecko Netscape/7.1 (ax)",
  "HTC_Dream Mozilla/5.0 (Linux; U; Android 1.5; en-ca; Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.2; U; de-DE) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/234.40.1 Safari/534.6 TouchPad/1.0",
  "Mozilla/5.0 (Linux; U; Android 1.5; en-us; sdk Build/CUPCAKE) AppleWebkit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 1.5; en-us; htc_bahamas Build/CRB17) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 2.1-update1; de-de; HTC Desire 1.19.161.5 Build/ERE27) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 1.5; de-ch; HTC Hero Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 2.1; en-us; HTC Legend Build/cupcake) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 1.5; de-de; HTC Magic Build/PLAT-RC33) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1 FirePHP/0.3",
  "Mozilla/5.0 (Linux; U; Android 1.6; en-us; HTC_TATTOO_A3288 Build/DRC79) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 1.0; en-us; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
  "Mozilla/5.0 (Linux; U; Android 1.5; en-us; T-Mobile G1 Build/CRB43) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari 525.20.1",
  "Mozilla/5.0 (Linux; U; Android 1.5; en-gb; T-Mobile_G2_Touch Build/CUPCAKE) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Droid Build/FRG22D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Milestone Build/ SHOLS_U2_01.03.1) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.0.1; de-de; Milestone Build/SHOLS_U2_01.14.0) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
  "Mozilla/5.0 (Linux; U; Android 0.5; en-us) AppleWebKit/522  (KHTML, like Gecko) Safari/419.3",
  "Mozilla/5.0 (Linux; U; Android 1.1; en-gb; dream) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
  "Mozilla/5.0 (Linux; U; Android 2.0; en-us; Droid Build/ESD20) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.1; en-us; Nexus One Build/ERD62) AppleWebKit/530.17 (KHTML, like Gecko) Version/4.0 Mobile Safari/530.17",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; Sprint APA9292KT Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-us; ADR6300 Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 2.2; en-ca; GT-P1000M Build/FROYO) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
  "Mozilla/5.0 (Linux; U; Android 3.0.1; fr-fr; A500 Build/HRI66) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
  "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/525.10  (KHTML, like Gecko) Version/3.0.4 Mobile Safari/523.12.2",
  "Mozilla/5.0 (Linux; U; Android 1.6; es-es; SonyEricssonX10i Build/R1FA016) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
  "Mozilla/5.0 (Linux; U; Android 1.6; en-us; SonyEricssonX10i Build/R1AA056) AppleWebKit/528.5  (KHTML, like Gecko) Version/3.1.2 Mobile Safari/525.20.1",
        };

        // One shared generator. Creating a new Random per call seeds it from the
        // clock on .NET Framework, so calls made in quick succession would all
        // pick the same agent.
        private static readonly Random s_random = new Random();

        /// <summary>
        /// Picks a random User-Agent string from the built-in pool.
        /// </summary>
        /// <returns>One of the entries of the User-Agent pool, chosen uniformly by index.</returns>
        private  static  string  GetUA()
        {
            // Random is not thread-safe; serialize access to the shared instance.
            lock (s_random)
            {
                return s_userAgents[s_random.Next(s_userAgents.Length)];
            }
        }
    }
}

 2、设计界面窗口,如1.中的主界面,主窗口的代码如下:

using Newtonsoft.Json.Linq;
using System;
using System.IO;
using System.Linq;
using System.Text;
using System.Windows.Forms;


namespace csharpspider
{
    public partial class BaiduSpider : Form
    {
        /// <summary>
        /// Creates the form and pre-selects the first entry of each filter selector.
        /// </summary>
        public BaiduSpider()
        {
            InitializeComponent();
            // Start every filter at index 0. Geturl() maps index 0 of the colour and
            // type selectors to "no filter", and uses the size selector's index to
            // look up a size code, so none of them may be left unselected.
            c_color.SelectedIndex = 0;
            c_type.SelectedIndex = 0;
            c_size.SelectedIndex = 0;
        }

        /// <summary>
        /// Lets the user pick the destination folder and copies the choice into the save-path box.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void bth_selectpath_Click(object sender, EventArgs e)
        {
            // The dialog wraps a native resource, so dispose it once the user is done.
            using (FolderBrowserDialog folderBrowserDialog = new FolderBrowserDialog())
            {
                folderBrowserDialog.Description = "请选择文件保存的路径!";
                folderBrowserDialog.ShowNewFolderButton = true;
                folderBrowserDialog.SelectedPath = Environment.CurrentDirectory;

                // Fully qualified on purpose: inside a Form, "DialogResult" alone
                // would bind to the Form.DialogResult property.
                if (folderBrowserDialog.ShowDialog() == System.Windows.Forms.DialogResult.OK)
                {
                    t_savapath.Text = folderBrowserDialog.SelectedPath;
                }
            }
        }

        /// <summary>
        /// Validates the keyword and save path, makes sure the target folder exists,
        /// then runs the crawl and reports completion.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void bth_download_Click(object sender, EventArgs e)
        {
            // Whitespace-only input is as unusable as empty input, so reject both.
            if (string.IsNullOrWhiteSpace(t_keyworld.Text))
            {
                MessageBox.Show("未输入关键词", "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }
            if (string.IsNullOrWhiteSpace(t_savapath.Text))
            {
                MessageBox.Show("未输入保存路径", "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }
            if (!Directory.Exists(t_savapath.Text))
            {
                try
                {
                    Directory.CreateDirectory(t_savapath.Text);
                }
                catch (Exception)
                {
                    // Invalid characters, missing permissions, unreachable drive, ...
                    MessageBox.Show("输入保存路径有误", "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
                    return;
                }
            }

            try
            {
                downloads(Geturl());
            }
            catch (Exception ex)
            {
                // Keep an unexpected failure inside the crawl from tearing down the
                // whole application; show what went wrong instead.
                MessageBox.Show(ex.Message, "错误", MessageBoxButtons.OK, MessageBoxIcon.Error);
                return;
            }
            MessageBox.Show("爬虫结束!");
        }

        /// <summary>
        /// Fetches each requested result page of the search and saves the images it
        /// lists into the folder named in the save-path box.
        /// </summary>
        /// <param name="url">
        /// Search URL containing the literal placeholder "[REPLACE]", which is
        /// substituted with the result offset of each page.
        /// </param>
        private void downloads(string url)
        {
            // NOTE(review): the offset advances by 50 per page while the URL built by
            // Geturl() asks for rn=30 results, so some results may be skipped — confirm
            // which of the two numbers is intended.
            const int offsetPerPage = 50;
            const int imagesPerPageTarget = 50;
            const int maxPasses = 10;

            int page = (int)n_page.Value;
            for (int i = 0; i < page; i++)
            {
                string pageUrl = url.Replace("[REPLACE]", (offsetPerPage * i).ToString());

                // Fetch the page source; a page that cannot be fetched is skipped.
                string html;
                try
                {
                    Stream stream = download.DownloadFile(pageUrl);
                    if (stream == null)
                    {
                        continue;
                    }
                    using (StreamReader sr = new StreamReader(stream))
                    {
                        html = sr.ReadToEnd();
                    }
                }
                catch (Exception)
                {
                    continue;
                }
                richTextBox2.Text = "网页源码:" + "\n" + html;

                // The body is expected to be a JSON object with a "data" array; a page
                // that is not valid JSON or has no such array is skipped instead of
                // crashing the crawl.
                JArray jarr;
                try
                {
                    jarr = JObject.Parse(html)["data"] as JArray;
                }
                catch (Exception)
                {
                    continue;
                }
                if (jarr == null)
                {
                    continue;
                }

                int num = 0;
                // Repeat the pass so that failed downloads get another chance; files that
                // were already saved are skipped by the File.Exists check below.
                for (int pass = 0; pass < maxPasses && num < imagesPerPageTarget; pass++)
                {
                    bool retryNeeded = false;
                    for (int j = 0; j < jarr.Count; j++)
                    {
                        JObject entry = jarr[j] as JObject;
                        if (entry == null)
                        {
                            continue;
                        }

                        try
                        {
                            // Default: the thumbnail of THIS entry (the original code always
                            // read entry 0 here, saving the same picture over and over).
                            string picurl = (string)entry["thumbURL"];
                            string Picreferer = "http://image.baidu.com/";

                            // Prefer the original picture and its source page when provided.
                            JArray replacements = entry["replaceUrl"] as JArray;
                            if (replacements != null && replacements.Count > 0)
                            {
                                string objUrl = (string)replacements[0]["ObjURL"];
                                if (!string.IsNullOrEmpty(objUrl))
                                {
                                    picurl = objUrl;
                                    string fromUrl = (string)replacements[0]["FromURL"];
                                    if (!string.IsNullOrEmpty(fromUrl))
                                    {
                                        Picreferer = fromUrl;
                                    }
                                }
                            }

                            // Entries without any usable URL (e.g. an empty object) are ignored.
                            if (string.IsNullOrEmpty(picurl))
                            {
                                continue;
                            }

                            // The file name is the last URL segment, made safe for the file system.
                            string savepath = Path.Combine(t_savapath.Text, ReName(picurl.Substring(picurl.LastIndexOf('/') + 1)));
                            if (File.Exists(savepath))
                            {
                                continue;
                            }

                            if (download.DownloadFile(picurl, savepath, Picreferer))
                            {
                                num++;
                                richTextBox1.Text += "图片下载成功!" + Environment.NewLine;
                            }
                            else
                            {
                                retryNeeded = true;
                                richTextBox1.Text += "图片下载失败!" + Environment.NewLine;
                            }
                        }
                        catch (Exception)
                        {
                            // One bad image must not stop the crawl; report it and retry on the next pass.
                            retryNeeded = true;
                            richTextBox1.Text += "图片下载失败!" + Environment.NewLine;
                        }
                    }

                    // Nothing failed in this pass, so another pass would find nothing to do.
                    if (!retryNeeded)
                    {
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Switches between the preset-size selector and the custom width/height
        /// inputs whenever the "custom size" check box changes.
        /// </summary>
        /// <param name="sender">Control that raised the event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void c_manue_CheckedChanged(object sender, EventArgs e)
        {
            bool useCustomSize = c_manue.Checked;

            // Exactly one of the two input methods is enabled at any time.
            c_size.Enabled = !useCustomSize;
            n_width.Enabled = useCustomSize;
            n_hight.Enabled = useCustomSize;
        }

        /// <summary>
        /// Turns a URL fragment into a usable file name by replacing every character
        /// that Windows forbids in file names (and the space) with an underscore.
        /// </summary>
        /// <param name="FileAdress">Candidate file name, typically the last segment of an image URL.</param>
        /// <returns>The sanitized name; a null or empty input is returned unchanged.</returns>
        public string ReName(string FileAdress)
        {
            if (string.IsNullOrEmpty(FileAdress))
            {
                return FileAdress;
            }

            // Characters Windows rejects in file names, plus the space. The double
            // quote was missing from the original list although it is also invalid.
            const string forbidden = ":*?\"\\/<>| ";

            StringBuilder cleaned = new StringBuilder(FileAdress.Length);
            foreach (char c in FileAdress)
            {
                cleaned.Append(forbidden.IndexOf(c) >= 0 ? '_' : c);
            }
            return cleaned.ToString();
        }

  
        /// <summary>
        /// Builds the Baidu image-search JSON request URL from the current form settings
        /// (keyword, colour, type, and preset or custom size).
        /// </summary>
        /// <returns>
        /// The request URL. It contains the literal placeholder "[REPLACE]" in the
        /// pn (result offset) parameter, to be substituted per page by the caller.
        /// </returns>
        private string Geturl()
        {
            string queryWorld, world;
            queryWorld = world = UrlEncode(t_keyworld.Text);

            bool customSize = c_manue.Checked;

            // Preset size codes by selector index: all sizes (none), small, medium,
            // large, extra large -> "", 1, 2, 3, 9. "All" must be empty, not a space,
            // or a raw blank ends up in the URL.
            string[] sizeCodes = { "", "1", "2", "3", "9" };
            string z = "";
            int sizeIndex = c_size.SelectedIndex;
            // The preset selector is disabled while custom size is active, so its value
            // is not sent together with width/height. NOTE(review): confirm the server
            // treats preset and custom size as mutually exclusive.
            if (!customSize && sizeIndex > 0 && sizeIndex < sizeCodes.Length)
            {
                z = sizeCodes[sizeIndex];
            }

            // Colour code is 2^(index - 1); index 0 means "all colours" (no filter).
            string ic = "";
            int colorIndex = c_color.SelectedIndex;
            if (colorIndex > 0)
            {
                ic = (1L << (colorIndex - 1)).ToString();
            }

            // Picture-type filter: each selector entry sets one of four query parameters.
            string st, s, face, lm;
            st = "-1";
            s = face = lm = "";
            switch (c_type.SelectedIndex)
            {
                case 1: s = "1"; break;
                case 2: face = "1"; break;
                case 3: st = "1"; break;
                case 4: st = "2"; break;
                case 5: lm = "6"; break;
                case 6: lm = "7"; break;
            }

            string width, hight;
            width = hight = "";
            if (customSize)
            {
                width = n_width.Value.ToString();
                hight = n_hight.Value.ToString();
            }

            // Two fixes in the template: the missing '&' between z and ic, and
            // "&copyright" which had been mangled into the © character.
            // NOTE(review): rn=30 here vs. an offset step of 50 in downloads() — confirm.
            string URL = string.Format("https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj&ct=201326592&is=&fp=result&queryWord={0}&cl=2&lm={1}&ie=utf-8&oe=utf-8&adpicid=&st={2}&z={3}&ic={4}&hd=0&latest=0&copyright=0&word={5}&s={6}&se=&tab=&width={7}&height={8}&face={9}&istype=2&qc=&nc=1&fr=&expermode=&force=&pn=[REPLACE]&rn=30&gsm=1e&1560065305311=", queryWorld, lm, st, z, ic, world, s, width, hight, face);
            return URL;
        }

        //对URL中的中文字符进行编码
        private  string UrlEncode(string  str)
        {
            StringBuilder sb = new StringBuilder();
            byte[] bystr = System.Text.Encoding.UTF8.GetBytes(str);
            for(int i=0;i

代码打包见:https://download.csdn.net/download/weixin_40695088/11235803

最后,还是把这个绿色的、夏意盎然的图片po上来,谢谢浏览~

爬虫百度图片并下载的程序——C#程序_第6张图片

你可能感兴趣的:(c#)