C# 实现抓取国家统计局行政区划数据爬虫

因为逻辑很简单,所以直接上代码:

using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;

public  class Program
    {
        /// <summary>
        /// Shared list of scraped entries.
        /// </summary>
        /// <remarks>
        /// NOTE(review): nothing in the visible part of this file reads or writes this list;
        /// confirm whether it is still needed.
        /// </remarks>
        public static List<Html_a> html_As = new List<Html_a>();

        /// <summary>
        /// Downloads a page with an HTTP GET request and decodes the body as GB2312.
        /// </summary>
        /// <param name="Url">Address of the page to download.</param>
        /// <returns>The response body as text.</returns>
        /// <remarks>
        /// Network-level failures (<see cref="WebException"/> and <see cref="IOException"/>) are
        /// retried after a 100 ms pause until the request succeeds. Any other exception, such as
        /// the one raised for a malformed URL, is not retried and propagates to the caller.
        /// </remarks>
        public static string HttpGet(string Url)
        {
            // The code-pages provider is registered so that GetEncoding can resolve "gb2312".
            Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
            Encoding pageEncoding = Encoding.GetEncoding("gb2312");

            // Retry in a loop rather than by recursion so that a long outage cannot
            // exhaust the stack.
            while (true)
            {
                try
                {
                    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
                    request.Method = "GET";
                    request.ContentType = "text/html;charset=gb2312";

                    // The using blocks release the connection even when reading fails part-way.
                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream responseStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(responseStream, pageEncoding))
                    {
                        return reader.ReadToEnd();
                    }
                }
                catch (Exception ex) when (ex is WebException || ex is IOException)
                {
                    // An error response still holds its connection; release it before retrying.
                    (ex as WebException)?.Response?.Dispose();
                    Thread.Sleep(100);
                }
            }
        }
        /// <summary>
        /// Program entry point: starts the crawl from the 2019 listing root address.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            const string rootUrl = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/";
            GetMsg(rootUrl);
        }

        /// <summary>
        /// Downloads "index.html" under <paramref name="Url"/> and parses it with GetShen.
        /// </summary>
        /// <param name="Url">Base address; "index.html" is appended to it as-is.</param>
        public static void GetMsg(string Url)
        {

            var shenarr = GetShen(HttpGet(Url + "index.html"));

            // NOTE(review): the loop below is cut off in this copy of the file. Its condition,
            // its body and the closing braces of this method are missing and must be restored
            // from the original project.
            for (var shen_i=2;shen_i
        /// <summary>
        /// Province level: collects the links found in the rows of the given page.
        /// </summary>
        /// <param name="str">HTML of the page to scan.</param>
        /// <returns>
        /// One entry per link match inside each row match, in the order they are found.
        /// </returns>
        /// <remarks>
        /// NOTE(review): both patterns read "(.*?)" in this copy, which only ever matches empty
        /// strings; the markup around the capture group was probably lost. Confirm the patterns
        /// against the original project. GetA is defined outside this excerpt and is assumed to
        /// return an <see cref="Html_a"/>.
        /// </remarks>
        public static List<Html_a> GetShen(string str)
        {
            List<Html_a> links = new List<Html_a>();

            // Both patterns are loop-invariant, so they are built once up front.
            Regex rowPattern = new Regex("(.*?)");
            Regex linkPattern = new Regex("(.*?)");

            foreach (Match row in rowPattern.Matches(str))
            {
                foreach (Match link in linkPattern.Matches(row.Value))
                {
                    links.Add(GetA(link.Value));
                }
            }

            return links;
        }
        /// <summary>
        /// Finds the anchor (a tag) fragments in a piece of HTML.
        /// </summary>
        /// <param name="html">HTML text to search.</param>
        /// <returns>Every match of the anchor pattern in <paramref name="html"/>.</returns>
        public static MatchCollection Get_A(string html)
        {
            // NOTE(review): the pattern reads "(.*?)" in this copy, which only matches empty
            // strings; the anchor markup around the group was probably lost. Confirm it
            // against the original project.
            return Regex.Matches(html, "(.*?)");
        }

        /// <summary>
        /// City level: parses the rows of the given page into entries.
        /// </summary>
        /// <param name="str">HTML of the page to parse.</param>
        /// <returns>The entries GetHtmlaArr builds from the row matches.</returns>
        /// <remarks>
        /// NOTE(review): the row pattern reads "(.*?)" in this copy, which only matches empty
        /// strings; the markup around the group was probably lost. Confirm it against the
        /// original project.
        /// </remarks>
        public static List<Html_a> GetShi(string str)
        {
            Regex rowPattern = new Regex("(.*?)");
            return GetHtmlaArr(rowPattern, rowPattern.Matches(str));
        }
        /// <summary>
        /// District level: parses the rows of the given page into entries.
        /// </summary>
        /// <param name="str">HTML of the page to parse.</param>
        /// <returns>The entries GetHtmlaArr builds from the row matches.</returns>
        /// <remarks>
        /// NOTE(review): the row pattern reads "(.*?)" in this copy, which only matches empty
        /// strings; the markup around the group was probably lost. Confirm it against the
        /// original project.
        /// </remarks>
        public static List<Html_a> GetQu(string str)
        {
            Regex rowPattern = new Regex("(.*?)");
            return GetHtmlaArr(rowPattern, rowPattern.Matches(str));
        }

        /// <summary>
        /// Converts row matches into <see cref="Html_a"/> entries.
        /// </summary>
        /// <param name="re">Row pattern; it is applied again to the text of every row match.</param>
        /// <param name="matches">Row matches to convert.</param>
        /// <returns>
        /// One entry for every fragment that contains exactly two anchors, or otherwise exactly
        /// two or three cell matches. Fragments with any other shape are skipped.
        /// </returns>
        /// <remarks>
        /// NOTE(review): the cell pattern reads "(.*?)" in this copy, which only matches empty
        /// strings; the markup around the group was probably lost. Confirm it against the
        /// original project. GetA is defined outside this excerpt.
        /// </remarks>
        private static List<Html_a> GetHtmlaArr(Regex re, MatchCollection matches)
        {
            List<Html_a> entries = new List<Html_a>();

            // Loop-invariant, so it is built once rather than once per row.
            Regex cellPattern = new Regex("(.*?)");

            foreach (Match row in matches)
            {
                foreach (Match fragment in re.Matches(row.Value))
                {
                    MatchCollection anchors = Get_A(fragment.Value);
                    if (anchors.Count == 2)
                    {
                        // Linked row: the first anchor supplies the code and the href,
                        // the second anchor supplies the name.
                        var codeLink = GetA(anchors[0].Value);
                        var nameLink = GetA(anchors[1].Value);
                        entries.Add(new Html_a
                        {
                            code = codeLink.name,
                            href = codeLink.href,
                            name = nameLink.name
                        });
                        continue;
                    }

                    // Unlinked row: the values are read from plain table cells.
                    MatchCollection cells = cellPattern.Matches(fragment.Value);
                    if (cells.Count == 2)
                    {
                        entries.Add(new Html_a
                        {
                            code = GetCellText(cells[0].Value),
                            name = GetCellText(cells[1].Value)
                        });
                    }
                    else if (cells.Count == 3)
                    {
                        // With three cells the middle one is stored as cxtype.
                        entries.Add(new Html_a
                        {
                            code = GetCellText(cells[0].Value),
                            cxtype = GetCellText(cells[1].Value),
                            name = GetCellText(cells[2].Value)
                        });
                    }
                }
            }

            return entries;
        }

        /// <summary>
        /// Returns the inner text of the first td element in an HTML fragment.
        /// </summary>
        /// <param name="cellHtml">HTML fragment expected to contain a td element.</param>
        /// <returns>The text of the td element, or of the whole fragment when there is none.</returns>
        private static string GetCellText(string cellHtml)
        {
            var cellDoc = new HtmlDocument();
            cellDoc.LoadHtml(cellHtml);

            // SelectSingleNode yields null when nothing matches; fall back to the fragment
            // itself instead of dereferencing null.
            var cellNode = cellDoc.DocumentNode.SelectSingleNode("//td") ?? cellDoc.DocumentNode;
            return cellNode.InnerText;
        }

        /// <summary>
        /// County / town level: parses the rows of the given page into entries.
        /// </summary>
        /// <param name="str">HTML of the page to parse.</param>
        /// <returns>The entries GetHtmlaArr builds from the row matches.</returns>
        /// <remarks>
        /// NOTE(review): the row pattern reads "(.*?)" in this copy, which only matches empty
        /// strings; the markup around the group was probably lost. Confirm it against the
        /// original project.
        /// </remarks>
        public static List<Html_a> GetXian(string str)
        {
            Regex rowPattern = new Regex("(.*?)");
            return GetHtmlaArr(rowPattern, rowPattern.Matches(str));
        }

        /// <summary>
        /// Sub-district (street) level: parses the rows of the given page into entries.
        /// </summary>
        /// <param name="str">HTML of the page to parse.</param>
        /// <returns>The entries GetHtmlaArr builds from the row matches.</returns>
        /// <remarks>
        /// NOTE(review): the row pattern reads "(.*?)" in this copy, which only matches empty
        /// strings; the markup around the group was probably lost. Confirm it against the
        /// original project.
        /// </remarks>
        public static List<Html_a> Getjiedao(string str)
        {
            Regex rowPattern = new Regex("(.*?)");
            return GetHtmlaArr(rowPattern, rowPattern.Matches(str));
        }
    }


    /// <summary>
    /// One scraped entry: a code, a display name and, where present, a link and a type value.
    /// </summary>
    /// <remarks>
    /// Declared public because public members of Program expose it; an internal type there
    /// is an inconsistent-accessibility compile error.
    /// </remarks>
    public class Html_a
    {
        /// <summary>Code of the entry, read from the first anchor or the first cell of a row.</summary>
        public string code { get; set; }

        /// <summary>Link target taken from the first anchor of a row; unset for unlinked rows.</summary>
        public string href { get; set; }

        /// <summary>
        /// Text of the middle cell of a three-cell row.
        /// NOTE(review): presumably an urban-rural classification code — confirm.
        /// </summary>
        public string cxtype { get; set; }

        /// <summary>Display name of the entry.</summary>
        public string name { get; set; }

        /// <summary>
        /// NOTE(review): never assigned in the visible part of the file; presumably the code
        /// of the parent entry — confirm.
        /// </summary>
        public string sjcode { get; set; }
    }
}

项目链接可提供新手参考:ToMoveTheBick.rar-互联网文档类资源-CSDN下载

你可能感兴趣的:(c#,爬虫,后端)