如何抓取WebClient、HttpWebRequest、WebRequest无法获取的网页源码,下面将为你解答...

//我要抓取的网页有两个难点:1、WebClient、HttpWebRequest、WebRequest 这三种方式都获取不到源码,无论怎样设置请求头都没用;2、单独使用 WebBrowser 控件又无法进行翻页操作。因此采用 WebBrowser 与 IE 相结合的方式来抓取。
//本项目流程:先打开 IE,再用 MSHTML 里的方法操作 IE 中的表单并进行翻页,然后把列表中的网址在 WebBrowser 中逐一打开,这样才能获取到源码。
这个项目的意义在于,无论网站是何种方式加载,都可以抓取到内容。
项目源码在文章最后。

  // Region names written into the query form's region text box (TAB_queryTblEnumItem_227) by appStart.
  string[] province = { "北京市", "天津市", "河北省", "山西省", "内蒙古", "辽宁省", "吉林省", "黑龙江省", "上海市", "江苏省", "浙江省", "安徽省", "福建省", "江西省", "山东省", "河南省", "湖北省", "湖南省", "广东省", "广西壮族", "海南省", "重庆市", "四川省", "贵州省", "云南省", "西藏", "陕西省", "甘肃省", "青海省", "宁夏回族", "新疆维吾尔", "新疆建设兵团" };
  // Region codes written into the matching hidden field (TAB_queryTblEnumItem_227_v);
  // provinceCode[i] belongs to province[i]. The published listing repeated 45 (Guangxi) for
  // Hainan; Hainan's administrative division code is 46.
  int[] provinceCode = { 11, 12, 13, 14, 15, 21, 22, 23, 31, 32, 33, 34, 35, 36, 37, 41, 42, 43, 44, 45, 46, 50, 51, 52, 53, 54, 61, 62, 63, 64, 65, 66 };

          //private Thread Thread_land;
    /// <summary>
    /// Click handler for button1: kicks off the collection run without blocking the caller.
    /// </summary>
    /// <param name="sender">Control that raised the event (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        // Delegate.BeginInvoke schedules appStart asynchronously and returns at once;
        // no completion callback and no state object are supplied.
        Action collector = appStart;
        collector.BeginInvoke(null, null);
    }
    // Checkpoint of the run in progress, stored as "<province>,<provinceCode>,<year>,<page>".
    // appStart loads it on start to resume, and writes it to the log table when the run stops.
    string TaskProgress = "";

    /// <summary>
    /// Collection worker, started asynchronously by button1_Click. For every open IE window
    /// showing the landchina query page it walks province x year, fills and submits the query
    /// form by injecting script, then pages through the results and hands each page's HTML to
    /// analysisSource. Errors and the final checkpoint are written through LandDB.BLL.CJLog.
    /// </summary>
    private void appStart()
    {
        try
        {
            Action at = new Action(() =>
            {
                textBox1.Text = "采集开始;\r\n";
                labCount.Text = "0";
                timer1.Enabled = true;
                labStartTime.Text = DateTime.Now.ToString();
            });
            this.Invoke(at);

            // Defaults for a fresh run. Paging starts at 2 because page 1 is analysed right
            // after the query is submitted.
            int proIndex = 0;
            int yearIndex = 2011;
            int pageIndex = 2;

            LandDB.BLL.CJLog cjlogBLL = new LandDB.BLL.CJLog();
            TaskProgress = cjlogBLL.GetTaskProgress();

            if (!string.IsNullOrEmpty(TaskProgress))
            {
                string[] s = TaskProgress.Split(',');
                int savedProvince = s.Length >= 4 ? Array.IndexOf(province, s[0]) : -1;

                // Resume only from a checkpoint that names a known province; otherwise an
                // index of -1 would be used to address the province array.
                if (savedProvince >= 0)
                {
                    proIndex = savedProvince;
                    yearIndex = ParseSavedYear(s[2], yearIndex);

                    int savedPage;
                    if (int.TryParse(s[3], out savedPage))
                    {
                        pageIndex = savedPage;
                    }
                }
            }

            Regex pageCountPattern = new Regex("共([0-9]{1,5})页([\\s\\ ]*共[0-9]{1,20})条记录", RegexOptions.Multiline);

            // Walk every shell window and use those showing the landchina query page.
            SHDocVw.ShellWindows IETabs = new SHDocVw.ShellWindows();
            foreach (SHDocVw.InternetExplorer ieTab in IETabs)
            {
                if (ieTab == null)
                {
                    continue;
                }

                string location = ieTab.LocationURL;
                if (location == null || !location.Contains("www.landchina.com/default.aspx"))
                {
                    // Not the query page (IE may not be open on it at all).
                    continue;
                }

                // '<' (not '<='): province[province.Length] does not exist.
                for (int i = proIndex; i < province.Length; i++)
                {
                    for (int y = yearIndex; y <= DateTime.Now.Year; y++)
                    {
                        mshtml.HTMLDocument doc = ieTab.Document as mshtml.HTMLDocument;
                        if (doc == null)
                        {
                            throw new InvalidOperationException("IE 页面文档不可用");
                        }

                        DateTime dtbegin = new DateTime(y, 1, 1);
                        DateTime dtend = new DateTime(y, 12, 31);

                        // Script that fills in the region and date-range conditions of the form.
                        // NOTE(review): the dates are formatted with the current culture's default
                        // DateTime format, exactly as before - confirm that is what the page expects.
                        string js = string.Format("document.getElementById('TAB_queryTblEnumItem_227').value='{0}';", province[i]);
                        js += string.Format("document.getElementById('TAB_queryTblEnumItem_227_v').value={0};", provinceCode[i]);
                        js += "document.getElementById('TAB_QueryConditionItem227').checked = true;";
                        js += "document.getElementById('TAB_QueryConditionItem268').checked = true;";
                        js += string.Format("document.getElementById('TAB_queryDateItem_268_1').value='{0}';", dtbegin);
                        js += string.Format("document.getElementById('TAB_queryDateItem_268_2').value='{0}';", dtend);

                        mshtml.IHTMLScriptElement script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                        script.text = js;

                        // Appending the script element to the body makes the page run it.
                        mshtml.HTMLBody body = doc.body as mshtml.HTMLBody;
                        body.appendChild((mshtml.IHTMLDOMNode)script);

                        // Press the page's own query button.
                        mshtml.IHTMLDocument2 doc2 = (mshtml.IHTMLDocument2)ieTab.Document;
                        mshtml.IHTMLElementCollection inputs = (mshtml.IHTMLElementCollection)doc2.all.tags("INPUT");
                        mshtml.IHTMLElement element_post = (mshtml.IHTMLElement)inputs.item("TAB_QueryButtonControl", 0);
                        element_post.click();

                        // NOTE(review): nothing here waits for the result page to finish loading
                        // before its HTML is read - confirm the click completes synchronously.
                        doc = ieTab.Document as mshtml.HTMLDocument;
                        analysisSource(doc.body.innerHTML);

                        Match ma = pageCountPattern.Match(doc.body.innerHTML);
                        if (!ma.Success)
                        {
                            // Previously this surfaced as an opaque FormatException from int.Parse("").
                            throw new InvalidOperationException("未能从页面中解析出总页数");
                        }

                        int pageCount = int.Parse(ma.Groups[1].Value);

                        for (int j = pageIndex; j <= pageCount; j++)
                        {
                            // Save the year itself so the checkpoint can be parsed back on resume.
                            TaskProgress = province[i] + "," + provinceCode[i] + "," + y + "," + j;

                            script = doc.createElement("script") as mshtml.IHTMLScriptElement;
                            script.text = string.Format("QueryAction.GoPage('TAB',{0})", j);
                            body = doc.body as mshtml.HTMLBody;
                            body.appendChild((mshtml.IHTMLDOMNode)script);

                            analysisSource(doc.body.innerHTML);

                            Action at1 = new Action(() => { textBox1.Text = ""; });
                            this.Invoke(at1);
                        }

                        // Only the first (resumed) year starts mid-way through its pages.
                        pageIndex = 2;
                    }

                    // Only the first (resumed) province starts mid-way through its years.
                    yearIndex = 2011;
                }
            }
        }
        catch (Exception ex)
        {
            Action at = new Action(() => { textBox1.Text += ex.Message + "\r\n"; });
            this.Invoke(at);

            // Record the error in the log table.
            LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
            LandDB.Model.CJLog model = new LandDB.Model.CJLog();
            model.FSourceContent = "appStart:" + ex.Message;
            model.Furl = GetBrowserUrlOnUiThread();
            bll.Add(model);
        }
        finally
        {
            Action at = new Action(() => { textBox1.Text += "采集停止;\r\n"; timer1.Enabled = false; });
            this.Invoke(at);

            // Record the checkpoint in the log table.
            LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
            LandDB.Model.CJLog model = new LandDB.Model.CJLog();
            model.TaskProgress = TaskProgress;
            model.FSourceContent = "appStart:";
            model.Furl = GetBrowserUrlOnUiThread();
            bll.Add(model);

            // The former this.EndInvoke(null) is gone: Control.EndInvoke rejects a null
            // IAsyncResult, so that call could only throw out of this finally block.
        }
    }

    /// <summary>
    /// Reads the year field of a saved checkpoint. Accepts a plain year as well as the date
    /// text that earlier versions stored in that position.
    /// </summary>
    /// <param name="saved">Third comma-separated field of the checkpoint.</param>
    /// <param name="fallback">Year returned when the field cannot be read.</param>
    /// <returns>The saved year, or <paramref name="fallback"/>.</returns>
    private static int ParseSavedYear(string saved, int fallback)
    {
        int year;
        if (int.TryParse(saved, out year))
        {
            return year;
        }

        DateTime date;
        if (DateTime.TryParse(saved, out date))
        {
            return date.Year;
        }

        return fallback;
    }

    /// <summary>
    /// Returns the address of the document shown in webBrowser1, or an empty string when no
    /// document is loaded. The control is read on the UI thread because appStart runs elsewhere.
    /// </summary>
    private string GetBrowserUrlOnUiThread()
    {
        string url = "";
        Action read = new Action(() =>
        {
            if (webBrowser1.Document != null && webBrowser1.Document.Url != null)
            {
                url = webBrowser1.Document.Url.ToString();
            }
        });

        if (this.InvokeRequired)
        {
            this.Invoke(read);
        }
        else
        {
            read();
        }

        return url;
    }

    /// <summary>
    /// Finds the announcement links in a result-list page and opens each one in webBrowser1.
    /// </summary>
    /// <param name="source">HTML of the list page; null or empty input is ignored.</param>
    private void analysisSource(string source)
    {
        if (string.IsNullOrEmpty(source))
        {
            return;
        }

        // Remove line breaks (and the indentation that follows them).
        source = Regex.Replace(source, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        source = Regex.Replace(source, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
        // The listing called source.Replace("\\r\\n", "") and discarded the result; strings are
        // immutable, so that did nothing. Any '\n' is already gone, so only '\r' can remain.
        source = source.Replace("\r", "");

        // NOTE(review): the published pattern began with "href]+)href=", which has an unmatched
        // ')' and makes the Regex constructor throw. The leading "<a([^>" was evidently eaten
        // as an HTML tag when the article was rendered; confirm against the project source.
        Regex reg = new Regex("<a([^>href]+)href=\"/DesktopModule/BizframeExtendMdl/workList/bulWorkView.aspx?([^\"]+)\"([^\\/]*)\\/?>", RegexOptions.Multiline | RegexOptions.IgnoreCase);
        Regex hrefPattern = new Regex("href=\"([^\"]+)\"");

        foreach (Match mat in reg.Matches(source))
        {
            // Attribute values inside innerHTML are entity-encoded, so "&amp;" must become "&".
            // (The listing showed Replace("&", "&"), a no-op produced by the same rendering.)
            string href = hrefPattern.Match(mat.Value).Groups[1].Value.Replace("&amp;", "&");
            string url = "http://www.landchina.com" + href;

            Action at = new Action(() =>
            {
                webBrowser1.Navigate(url);
                textBox1.Text += url + "\r\n";
            });
            this.Invoke(at);

            // Fixed pause between navigations.
            // NOTE(review): presumably this gives webBrowser1 time to load the page and raise
            // DocumentCompleted before the next URL replaces it - confirm 3 s is enough.
            Thread.Sleep(3000);
        }
    }

    /// <summary>
    /// Runs when webBrowser1 finishes loading a document: cleans the page HTML, cuts out the
    /// announcement part and passes it to getParam. Empty pages and errors are written to the
    /// log table through LandDB.BLL.CJLog.
    /// </summary>
    /// <param name="sender">Control that raised the event (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
    {
        // Resolved once and null-safely, so the error path below cannot fail on a missing
        // document while it is trying to report another failure.
        string pageUrl = "";

        try
        {
            var document = webBrowser1.Document;
            if (document != null && document.Url != null)
            {
                pageUrl = document.Url.ToString();
            }

            string webInfo = (document != null && document.Body != null) ? document.Body.InnerHtml : null;

            if (!string.IsNullOrEmpty(webInfo) && webInfo.IndexOf("占地公告") == -1)
            {
                // Remove line breaks (and the indentation that follows them).
                webInfo = Regex.Replace(webInfo, "([\\r\\n])[\\s]+", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                webInfo = Regex.Replace(webInfo, "\\n", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
                // The listing discarded the result of webInfo.Replace("\\r\\n", ""); any '\n' is
                // already gone at this point, so only '\r' can remain.
                webInfo = webInfo.Replace("\r", "");
                webInfo = CleanWordHtml(webInfo);

                // NOTE(review): as published, this pattern has an unmatched ')' and Regex.Match
                // will throw ArgumentException. Tag text inside the literal appears to have been
                // stripped when the article was rendered; it is kept byte-for-byte here and must
                // be restored from the project source.
                string Splitstr = @"().*?(|class=[""']?link1[""']? id=[""']?lnkOldBul[""']?))";
                Match cc = Regex.Match(webInfo, Splitstr, RegexOptions.IgnoreCase | RegexOptions.Multiline);
                webInfo = cc.Groups[0].ToString();
                webInfo = webInfo.ToLower();

                if (webInfo != "")
                {
                    getParam(webInfo, pageUrl);
                }

                textBox1.Text += "采集完成\r\n";
                labCount.Text = (int.Parse(labCount.Text) + 1).ToString();
            }
            else
            {
                textBox1.Text += "空页面\r\n";

                // Record the empty page in the log table.
                LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
                LandDB.Model.CJLog model = new LandDB.Model.CJLog();
                model.Furl = pageUrl;
                bll.Add(model);
            }
        }
        catch (Exception ex)
        {
            // The handler touches the controls directly throughout, so no Invoke is needed here.
            textBox1.Text += ex.Message + "\r\n";

            // Record the error in the log table.
            LandDB.BLL.CJLog bll = new LandDB.BLL.CJLog();
            LandDB.Model.CJLog model = new LandDB.Model.CJLog();
            model.FSourceContent = ex.Message;
            model.Furl = pageUrl;
            bll.Add(model);
        }
    }
    // FormClosed handler for the main form. Intentionally empty: nothing is done on close.
    private void Main_FormClosed(object sender, FormClosedEventArgs e)
    {

    }

    /// <summary>
    /// Parses one announcement page and stores the announcement together with its land blocks.
    /// An announcement that is already stored is re-collected only while its gtState is 2 or 3.
    /// </summary>
    /// <param name="strWebData">Announcement HTML to parse.</param>
    /// <param name="url">Address the page came from; saved on the announcement record.</param>
    private void getParam(string strWebData, string url)
    {
        LandDB.BLL.blockNote bnBll = new LandDB.BLL.blockNote();
        LandDB.Model.blockNote bnModel = new LandDB.Model.blockNote();

        bnModel.gtUrl = url;
        bnModel.dataType = 1;

        // Title.
        // NOTE(review): several patterns in this block start with ']' or end oddly; the article's
        // HTML rendering appears to have stripped tag text out of the literals. They are kept
        // byte-for-byte and need to be checked against the project source.
        string Splitstr = "]*>([^<]*)";
        Match ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        string strTitle = ma.Groups[1].ToString();
        bnModel.topic = strTitle;

        // Announcement number: the first parenthesised part of the title, without the parentheses.
        Splitstr = "[(].*?[)]";
        ma = Regex.Match(strTitle, Splitstr);
        string noteno = ma.Groups[0].ToString().Replace("(", "").Replace(")", "");
        bnModel.noteNo = noteno;

        // Publishing unit. The line break inside this verbatim literal is part of the pattern as
        // published (see the note above) and is preserved.
        Splitstr = @"](?:align=[""']?right[""']?)*>([^<]*)
";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        bnModel.pubUnit = ma.Groups[1].Value.Trim();

        // The title and number are scraped text spliced into a where-clause, so single quotes are
        // doubled to keep them inside the SQL string literals.
        string where = string.Format(
            "topic='{0}' and noteNo='{1}' and state=1 ",
            strTitle.Replace("'", "''"),
            noteno.Replace("'", "''"));

        if (bnBll.Exists(where))
        {
            bnModel = bnBll.GetModel(strTitle, noteno);

            // gtState: 1 = collected, 2 = collection failed, 3 = land blocks incomplete.
            if (bnModel.gtState == 2)
            {
                // Parse the announcement again and update the stored record.
                AddblockNote(strWebData, ref bnModel);
                bnBll.Update(bnModel);
                CollectBlocksForNote(strWebData, bnBll, bnModel);
            }
            else if (bnModel.gtState == 3)
            {
                CollectBlocksForNote(strWebData, bnBll, bnModel);
            }
        }
        else
        {
            // New announcement: insert it, then insert its land blocks.
            AddblockNote(strWebData, ref bnModel);
            bnModel.noteId = bnBll.Add(bnModel);
            CollectBlocksForNote(strWebData, bnBll, bnModel);
        }
    }

    /// <summary>
    /// Stores the land blocks of an announcement and marks the announcement gtstate=3 when
    /// AddblockInfo raises its out flag.
    /// </summary>
    private void CollectBlocksForNote(string strWebData, LandDB.BLL.blockNote bnBll, LandDB.Model.blockNote bnModel)
    {
        // AddblockInfo sets the flag from Exists("gtState=2"): true means incomplete rows exist.
        bool isComp = false;
        AddblockInfo(strWebData, bnModel, out isComp);
        if (isComp)
        {
            bnBll.Update("gtstate=3", bnModel.noteId);
        }
    }

    /// <summary>
    /// Fills region, dates, transfer mode and collection state of an announcement from its HTML.
    /// A date that cannot be read no longer throws; the announcement is marked gtState 2 instead.
    /// </summary>
    /// <param name="strWebData">Announcement HTML to parse.</param>
    /// <param name="bnModel">Announcement record to fill in.</param>
    private void AddblockNote(string strWebData, ref LandDB.Model.blockNote bnModel)
    {
        string Splitstr = "";
        string province = "";
        string city = "";
        string blockZone = "";
        Match ma = null;
        bool datesOk = true;
        DateTime parsed;

        // Region: "province > city > zone".
        Splitstr = @"](?:id=[""']?lblXzq[""']?)*>([^<]*)";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        string partweb = ma.Groups[1].ToString();
        if (partweb != "")
        {
            // The separator may arrive entity-encoded, so both spellings are accepted.
            string[] s = partweb.Replace("&gt;", "|").Replace(">", "|").Split('|');
            province = s[0].Replace("行政区:", "").Trim();
            if (s.Length >= 2)
            {
                city = s[1];
                if (s.Length == 3)
                {
                    blockZone = s[2];
                }
            }
        }

        bnModel.province = province;
        bnModel.city = city;
        bnModel.blockZone = blockZone;

        // Transfer date.
        Splitstr = "(?:号地块:)(.*?日)";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        partweb = ma.Groups[1].Value;
        if (partweb == "")
        {
            // Not present there: take it from section six of the announcement instead.
            Splitstr = "(六、).*?[年]?()";
            ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
            partweb = ma.Groups[0].ToString();
            Splitstr = ".*?日";
            ma = Regex.Match(partweb, Splitstr, RegexOptions.IgnoreCase);
            partweb = StripScrapedMarkup(ma.Groups[0].ToString().Replace("年", "-").Replace("月", "-").Replace("日", "")).Trim();
        }
        else
        {
            partweb = StripScrapedMarkup(partweb).Trim();
        }

        if (TryParseScrapedDate(partweb, out parsed))
        {
            bnModel.transferDate = parsed;
        }
        else
        {
            datesOk = false;
        }

        // Transfer mode.
        Splitstr = "(?:以 )(.*?)(?: 方式出让 )";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        partweb = ma.Groups[1].ToString();
        if (partweb != "")
        {
            bnModel.remiseWay = partweb;
        }

        // Creation time: today's date without a time part.
        bnModel.createTime = DateTime.Today;

        // Expiry date.
        Splitstr = "(?:号地块:).*?(?: ;)";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].ToString();
        if (partweb != "")
        {
            Splitstr = "(至 ).*?(日)";
            ma = Regex.Match(partweb, Splitstr, RegexOptions.IgnoreCase);
            partweb = StripScrapedMarkup(ma.Groups[0].ToString().Replace("至 ", ""));
            // Turn the year/month/day characters into '-' separators.
            partweb = Regex.Replace(partweb, "[\u4e00-\u9fa5]", "-").Trim('-');
        }
        else
        {
            // Auction announcements state the deadline in a different sentence.
            Splitstr = "(截止时间为).*?[年]?(日)";
            ma = Regex.Match(strWebData, Splitstr);
            partweb = StripScrapedMarkup(ma.Groups[0].ToString().Replace("年", "-").Replace("月", "-").Replace("日", "").Replace("截止时间为", "")).Trim();
        }

        if (TryParseScrapedDate(partweb, out parsed))
        {
            bnModel.expireDate = parsed;
        }
        else
        {
            datesOk = false;
        }

        // Publication date.
        Splitstr = @"](id=[""']?lblCreateDate[""']?)*>([^<]*)";
        ma = Regex.Match(strWebData, Splitstr, RegexOptions.IgnoreCase);
        partweb = ma.Groups[0].ToString().Trim();
        if (partweb != "")
        {
            partweb = Regex.Match(partweb, "\\d{4}年\\d{1,2}月\\d{1,2}日").Value;
            if (TryParseScrapedDate(partweb, out parsed))
            {
                bnModel.pubDate = parsed;
            }
            else
            {
                datesOk = false;
            }
        }

        // gtState 2 = collection failed (something is missing), 1 = collected.
        if (!datesOk || string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.topic) || bnModel.pubDate == null || string.IsNullOrEmpty(bnModel.remiseWay) || string.IsNullOrEmpty(bnModel.pubUnit) || bnModel.expireDate == null || bnModel.transferDate == null || string.IsNullOrEmpty(bnModel.blockZone))
        {
            bnModel.gtState = 2;
        }
        else
        {
            bnModel.gtState = 1;
        }
    }

    /// <summary>
    /// Removes HTML tags and non-breaking-space entities from a scraped fragment.
    /// </summary>
    /// <remarks>
    /// Stands in for the Replace("", "") calls of the published listing, which throw
    /// ArgumentException because String.Replace rejects an empty search string.
    /// NOTE(review): the original search strings were presumably tags or entities lost when the
    /// article was rendered - confirm against the project source.
    /// </remarks>
    private static string StripScrapedMarkup(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return "";
        }

        return Regex.Replace(text, @"<[^>]+>", "").Replace("&nbsp;", "");
    }

    /// <summary>
    /// Parses a scraped date, first with the current culture and then as zh-CN so that
    /// year/month/day text written with Chinese date characters is understood on any machine.
    /// </summary>
    /// <returns>True when <paramref name="value"/> holds a parsed date.</returns>
    private static bool TryParseScrapedDate(string text, out DateTime value)
    {
        value = default(DateTime);
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        text = text.Trim();
        if (DateTime.TryParse(text, out value))
        {
            return true;
        }

        return DateTime.TryParse(
            text,
            System.Globalization.CultureInfo.GetCultureInfo("zh-CN"),
            System.Globalization.DateTimeStyles.None,
            out value);
    }
    /// <summary>
    /// Extracts the land blocks of an announcement and inserts each one through
    /// LandDB.BLL.blockInfo, reading every block as a run of label / value table cells.
    /// </summary>
    /// <param name="strWebData">Announcement HTML to parse.</param>
    /// <param name="bnModel">Announcement the blocks belong to; supplies region, unit and mode.</param>
    /// <param name="isComp">Set from Exists("gtState=2"): true when incomplete block rows exist.</param>
    private void AddblockInfo(string strWebData, LandDB.Model.blockNote bnModel, out bool isComp)
    {
        // NOTE(review): this removes every space exactly as published; the original literal may
        // have been a non-breaking-space entity lost when the article was rendered - confirm.
        strWebData = strWebData.Replace(" ", "");
        LandDB.BLL.blockInfo bIBll = new LandDB.BLL.blockInfo();

        // One match per land block.
        // NOTE(review): the pattern (including the line break inside the verbatim literal) is
        // kept byte-for-byte from the listing; its tag text appears to have been stripped by the
        // article's HTML rendering and must be restored from the project source.
        Regex divRg = new Regex(@".*?
", RegexOptions.Multiline | RegexOptions.IgnoreCase);

        // One match per table cell. The listing showed "(]?()", which is unbalanced and makes the
        // Regex constructor throw; "(<td).*?[^>]?(</td>)" is the text that leaves exactly that
        // residue once the tags are stripped. NOTE(review): confirm against the project source.
        Regex cellPattern = new Regex("(<td).*?[^>]?(</td>)", RegexOptions.IgnoreCase);
        Regex leadingNumber = new Regex(@"^(\-|\+)?[\d,]+(\.\d+)?");

        foreach (Match match in divRg.Matches(strWebData))
        {
            string blockTable = match.Value;
            if (blockTable == "")
            {
                continue;
            }

            LandDB.Model.blockInfo bIModel = new LandDB.Model.blockInfo();
            bIModel.noteId = bnModel.noteId;
            bIModel.province = bnModel.province;
            bIModel.city = bnModel.city;
            bIModel.blockZone = bnModel.blockZone;
            bIModel.transferMode = bnModel.remiseWay;
            bIModel.dataType = 1;
            bIModel.blockState = 2;
            bIModel.state = 1;
            bIModel.createTime = DateTime.Now;
            bIModel.pubUnit = bnModel.pubUnit;
            bIModel.firstPrice = 0;
            bIModel.donePrice = 0;
            bIModel.doneArea = 0;
            bIModel.floorPrice = 0;
            bIModel.blockArea = "";

            bool fieldsParsed = true;
            MatchCollection mc = cellPattern.Matches(blockTable);

            // A label needs a value cell after it, so the last cell is never read as a label.
            // (The listing indexed mc[i + 1] unconditionally and went out of range there.)
            for (int i = 0; i + 1 < mc.Count; i++)
            {
                string ed = Regex.Replace(mc[i].Value, @"<[^>]+>", "");
                if (string.IsNullOrEmpty(ed))
                {
                    continue;
                }

                string tdvalue = Regex.Replace(mc[i + 1].Value, @"<[^>]+>", "");
                bool consumed = true;

                if (ed == "宗地编号:")
                {
                    bIModel.blockNo = tdvalue;
                }
                else if (ed == "宗地面积:" || ed == "宗地总面积:")
                {
                    bIModel.blockArea = tdvalue;
                }
                else if (ed == "起始价:")
                {
                    Match first = leadingNumber.Match(tdvalue);
                    decimal price;
                    // Invariant culture: the number uses ',' for grouping and '.' for decimals.
                    if (first.Success && decimal.TryParse(first.Value, System.Globalization.NumberStyles.Number, System.Globalization.CultureInfo.InvariantCulture, out price))
                    {
                        bIModel.firstPrice = price;
                    }
                }
                else if (ed == "容积率:")
                {
                    bIModel.blockRate = leadingNumber.Match(tdvalue).Value;
                    bIModel.blockRateStr = tdvalue;
                }
                else if (ed == "土地用途:" || ed == "土地用途明细:")
                {
                    bIModel.oriUseType = tdvalue;
                }
                else if (ed == "宗地坐落:")
                {
                    bIModel.blockAddress = tdvalue;
                }
                else if (ed.IndexOf("挂牌截止时间") != -1)
                {
                    DateTime expire;
                    if (TryParseBlockDate(tdvalue, out expire))
                    {
                        bIModel.expireDate = expire;
                    }
                    else
                    {
                        // An unreadable date marks the block incomplete instead of throwing.
                        fieldsParsed = false;
                    }
                }
                else if (ed.IndexOf("估价报告备案号") != -1)
                {
                    bIModel.RecordNumberOfAppraisalReport = tdvalue;
                }
                else
                {
                    consumed = false;
                }

                if (consumed)
                {
                    // Skip the value cell that was just read.
                    i++;
                }
            }

            // Address and number are scraped text spliced into a where-clause, so single quotes
            // are doubled to keep them inside the SQL string literals.
            string where = string.Format(
                "blockAddress='{0}' and blockNo='{1}' and state=1 ",
                (bIModel.blockAddress ?? "").Replace("'", "''"),
                (bIModel.blockNo ?? "").Replace("'", "''"));
            bool isEx = bIBll.Exists(where);

            // gtState 2 = collection failed (something is missing), 1 = collected.
            if (!fieldsParsed || string.IsNullOrEmpty(bnModel.province) || string.IsNullOrEmpty(bnModel.city) || string.IsNullOrEmpty(bnModel.blockZone) || string.IsNullOrEmpty(bIModel.blockArea) || string.IsNullOrEmpty(bIModel.oriUseType) || bIModel.expireDate == null || string.IsNullOrEmpty(bIModel.pubUnit) || string.IsNullOrEmpty(bIModel.blockAddress) || string.IsNullOrEmpty(bIModel.blockNo) || bIModel.firstPrice == null || string.IsNullOrEmpty(bIModel.blockRateStr) || string.IsNullOrEmpty(bIModel.RecordNumberOfAppraisalReport))
            {
                bIModel.gtState = 2;
            }
            else
            {
                bIModel.gtState = 1;
            }

            if (isEx)
            {
                // NOTE(review): as in the listing, the stored row replaces the freshly parsed one
                // and is then written back unchanged; the new values never reach the database.
                // Probably the parsed fields were meant to be copied onto the stored row first.
                bIModel = bIBll.GetModel(bIModel.blockAddress, bIModel.blockNo);
                if (bIModel != null && bIModel.gtState == 2)
                {
                    bIBll.Update(bIModel);
                }
            }
            else
            {
                bIBll.Add(bIModel);
            }
        }

        // NOTE(review): this condition is not limited to the current announcement, so a failed
        // block of any announcement raises the flag - confirm whether a noteId filter is intended.
        isComp = bIBll.Exists("gtState=2");
    }

    /// <summary>
    /// Parses a scraped date, first with the current culture and then as zh-CN.
    /// </summary>
    /// <returns>True when <paramref name="value"/> holds a parsed date.</returns>
    private static bool TryParseBlockDate(string text, out DateTime value)
    {
        value = default(DateTime);
        if (string.IsNullOrEmpty(text))
        {
            return false;
        }

        if (DateTime.TryParse(text, out value))
        {
            return true;
        }

        return DateTime.TryParse(
            text,
            System.Globalization.CultureInfo.GetCultureInfo("zh-CN"),
            System.Globalization.DateTimeStyles.None,
            out value);
    }

    // Project source download (link from the original article): http://www.onethink.top/1/SoudiWinForm.zip

你可能感兴趣的:(如何抓取WebClient、HttpWebRequest、WebRequest无法获取的网页源码,下面将为你解答...)