C# 数据抓取

实现效果

抓取行政区划代码
地址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2013/index.html

解决思路

分析页面数据规则,模拟请求获取页面内容,正则过滤,数据入库,递归

代码示例

建表、添加、删除

    /// <summary>
    /// Drops the user table <c>dbo.&lt;tableName&gt;</c> if it exists, then creates it again
    /// with the columns id, type, name, pid and url.
    /// </summary>
    /// <param name="tableName">
    /// Bare table name (without schema). NOTE(review): it is concatenated into the
    /// statements unescaped, so it must come from a trusted source.
    /// </param>
    public void CreateDB(string tableName)
    {
        // Statement 1: remove a previous copy of the table, if any.
        string dropIfPresent = string.Concat(
            "IF  EXISTS (SELECT * FROM sys.objects WHERE object_id = OBJECT_ID(N'dbo.",
            tableName,
            "') AND type in (N'U')) DROP TABLE dbo.",
            tableName);

        // Statement 2: create the empty table.
        string createTable = string.Concat(
            " CREATE TABLE dbo.",
            tableName,
            "([id] [varchar](50) NULL,[type] [varchar](50) NULL,[name] [nvarchar](50) NULL,[pid] [varchar](50) NULL,[url] [varchar](100) NULL) ON [PRIMARY]");

        SQLHelper helper = new SQLHelper();
        foreach (string statement in new[] { dropIfPresent, createTable })
        {
            helper.ExecuteSQLNonQuery(statement);
        }
    }

    /// <summary>
    /// Inserts one region row (id, type, name, pid, url) into the given table.
    /// </summary>
    /// <param name="r">Region whose fields are written to the row.</param>
    /// <param name="tableName">
    /// Target table name. It is concatenated into the statement as-is (identifiers
    /// cannot be passed as literals), so it must come from a trusted source.
    /// </param>
    public void InsertDB(Region r, string tableName)
    {
        // The values come from scraped web pages, so every one is escaped before it
        // is placed in the statement. [name] is an nvarchar column and therefore
        // gets a Unicode (N'...') literal so Chinese text is not squeezed through
        // the database's default code page.
        string strSql = "INSERT INTO " + tableName + " ([id],[type],[name],[pid],[url]) VALUES("
            + QuoteSqlLiteral(r.id, false) + ","
            + QuoteSqlLiteral(r.type, false) + ","
            + QuoteSqlLiteral(r.name, true) + ","
            + QuoteSqlLiteral(r.pid, false) + ","
            + QuoteSqlLiteral(r.url, false) + ")";
        SQLHelper sqlh = new SQLHelper();
        sqlh.ExecuteSQLNonQuery(strSql);
    }

    /// <summary>
    /// Formats a value as a quoted T-SQL string literal.
    /// </summary>
    /// <param name="value">Text to quote; <c>null</c> is written as an empty string.</param>
    /// <param name="unicode">When true the literal is prefixed with <c>N</c>.</param>
    /// <returns>The quoted literal with embedded single quotes doubled.</returns>
    private static string QuoteSqlLiteral(string value, bool unicode)
    {
        // Doubling embedded quotes keeps the text from terminating the literal early.
        string escaped = (value ?? string.Empty).Replace("'", "''");
        return (unicode ? "N'" : "'") + escaped + "'";
    }

    /// <summary>
    /// Deletes every row from the [GetRegion] table.
    /// </summary>
    public void DeleteDB()
    {
        const string clearAllRows = "delete from  [GetRegion] ";
        new SQLHelper().ExecuteSQLNonQuery(clearAllRows);
    }

数据抓取

    /// <summary>
    /// Sends an HTTP request and returns the response body as text.
    /// </summary>
    /// <param name="strUrl">Address of the page to request.</param>
    /// <param name="strType">HTTP method, e.g. POST or GET.</param>
    /// <param name="strEncoding">Name of the encoding used to decode the response, e.g. UTF-8 or gb2312.</param>
    /// <returns>The content of the page.</returns>
    /// <exception cref="ApplicationException">The URL does not yield an HTTP request.</exception>
    public string Send(string strUrl, string strType, string strEncoding)
    {
        // "as" rather than a hard cast: a URL with a non-HTTP scheme then reaches the
        // check below instead of failing with an InvalidCastException.
        HttpWebRequest httpReq = WebRequest.Create(strUrl) as HttpWebRequest;
        if (httpReq == null)
        {
            throw new ApplicationException(string.Format("Invalid url string: {0}", strUrl));
        }
        httpReq.Method = strType;
        httpReq.Timeout = 1000 * 30; // milliseconds

        // Dispose the response, stream and reader even when reading or decoding throws;
        // an unclosed response keeps its connection checked out of the pool, which
        // stalls later requests when this method is called for many pages.
        using (HttpWebResponse httpRes = (HttpWebResponse)httpReq.GetResponse())
        using (Stream responseStream = httpRes.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.GetEncoding(strEncoding)))
        {
            return reader.ReadToEnd();
        }
    }

递归

    /// <summary>
    /// Downloads one page of the division-code site, inserts every region listed on it
    /// and, for pages whose entries are links, recurses into each linked child page.
    /// </summary>
    /// <param name="strUrl">Absolute URL of the page to scrape.</param>
    /// <param name="strPid">Id of the parent region; stored as <c>pid</c> on every row found on this page.</param>
    /// <param name="tableName">Name of the table the rows are inserted into.</param>
    public void GetRegion(string strUrl, string strPid, string tableName)
    {
        string strHtml = Send(strUrl, "GET", "gb2312");
        string strTable = AnalyzeHtml(strHtml);

        // Village pages: plain table cells without links, so nothing to recurse into.
        if (strHtml.Contains("villagetable"))
        {
            Regex reg = new Regex(@"<td>(?<text>.*?)</td>");
            MatchCollection mc = reg.Matches(strTable);

            // The first match is skipped and the rest are read three per row
            // (id, type, name). NOTE(review): the skipped match is presumably the
            // one header cell written as a bare <td> - confirm against a live page.
            for (int i = 0; i < (mc.Count - 1) / 3; i++)
            {
                Region r = new Region();
                r.id = mc[i * 3 + 1].Groups["text"].Value;
                r.type = mc[i * 3 + 2].Groups["text"].Value;
                r.pid = strPid;
                r.name = mc[i * 3 + 3].Groups["text"].Value;
                InsertDB(r, tableName);
            }
        }
        else
        {
            // Captures every anchor: "url" is the href value, "text" is the link text.
            Regex reg = new Regex(@"(?is)<a[^>]*?href=(['""]?)(?<url>[^'""\s>]+)\1[^>]*>(?<text>(?:(?!</?a\b).)*)</a>");
            MatchCollection mc = reg.Matches(strTable);

            // Links on the page are relative, so they are appended to the directory
            // part of the current URL (everything up to and including the last '/').
            string baseUrl = strUrl.Substring(0, strUrl.LastIndexOf('/') + 1);

            // Province / municipality page: one link per region, ids are generated
            // from the link position ("001", "002", ...).
            if (strHtml.Contains("provincetable"))
            {
                for (int i = 0; i < mc.Count; i++)
                {
                    Region r = new Region();
                    r.id = "00" + (i + 1).ToString();
                    r.type = "";
                    r.pid = strPid;
                    r.name = mc[i].Groups["text"].Value.Replace("<br/>", "");
                    r.url = baseUrl + mc[i].Groups["url"].Value;
                    InsertDB(r, tableName);
                    GetRegion(r.url, r.id, tableName);
                }
            }
            else
            {
                // Intermediate levels: links are read in pairs - the first link of a
                // pair carries the code, the second the name; both share one href.
                for (int i = 0; i < mc.Count / 2; i++)
                {
                    Region r = new Region();
                    r.id = mc[i * 2].Groups["text"].Value.Replace("<br/>", "");
                    r.type = "";
                    r.pid = strPid;
                    r.name = mc[i * 2 + 1].Groups["text"].Value.Replace("<br/>", "");
                    r.url = baseUrl + mc[i * 2].Groups["url"].Value;
                    InsertDB(r, tableName);
                    GetRegion(r.url, r.id, tableName);
                }
            }
        }
    }

遇到问题及解决思路

数据量大

数据分省份保存入库,提供多种处理方式:数据量少时,入库的同时导出;数据量大时,先入库,再分多个工作簿导出。如无法导出,可直接拷贝粘贴,或者使用数据库的标准导出功能。

检查完整性

自动检查,加上手工抽查,记录获取数据条目,对比数据库记录数。检查省份最后乡镇街道数据,随机抽查省份数据。

操作界面

C# 数据抓取_第1张图片
操作界面

你可能感兴趣的:(C# 数据抓取)