超简单版网络爬虫(定制)

本程序编写初衷只是想稍微帮下一朋友从某网页抓取数据资料,代码及逻辑都很简单(目标网页貌似很不怎样,各方面性能都较差...),可修改优化之处很多,仅供业余摆弄。
{
  问题来源: http://jdxx.zhs.mofcom.gov.cn/website/btgs.jsp
   实现步骤:
    1、浏览指定网页,使用者输入查询条件查询;
    2、点击 获取网页内容 按钮,进行
      2.1、跳到 第N页,保存其内容到 ListView;重复...
      2.2、保存 ListView 内容 到  Excel;
}
unit NetFetch;
interface
uses
  Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
  Dialogs, StdCtrls, Buttons, ExtCtrls, OleCtrls, SHDocVw, XPMan, ComCtrls;
type
  Tfrm_Main = class(TForm)
    XPManifest1: TXPManifest;
    wb_WebContent: TWebBrowser;
    Panel1: TPanel;
    Panel2: TPanel;
    Label1: TLabel;
    btn_GotoURL: TSpeedButton;
    edt_URL: TEdit;
    btn_Close: TButton;
    dlg_SaveInfo: TSaveDialog;
    Tmr_ClsDlg: TTimer;
    btn_GetNetInfo: TButton;
    lv_Info: TListView;
    lbl_Hints: TLabel;
    Tmr_GotoURL: TTimer;
    procedure FormShow(Sender: TObject);
    procedure FormResize(Sender: TObject);
    procedure edt_URLKeyDown(Sender: TObject; var Key: Word;
      Shift: TShiftState);
    procedure btn_GotoURLClick(Sender: TObject);
    procedure btn_GetNetInfoClick(Sender: TObject);
    procedure btn_CloseClick(Sender: TObject);
    procedure Tmr_GotoURLTimer(Sender: TObject);
    procedure Tmr_ClsDlgTimer(Sender: TObject);
  private
    { Private declarations }
    FTotalCount: string;                              // 总数据条数
    FTotalPageCount: string;                          // 总页数
  public
    { Public declarations }
    function GetDatasInfo: Boolean;                   // 得到数据条数、数据总页数
    function GotoNthPage(PageIndex: string): Boolean; // 转至第 PageIndex 页
    function SaveInfoToLV: Boolean;                   // 保存抓取的数据入 ListView
    function SaveLVToXLS(FileName: string): Boolean;  // 将 ListView 内容存入 Excel
  end;
var
  frm_Main: Tfrm_Main;
implementation
uses
  MSHtml, ComObj, ShellAPI;
{$R *.dfm}
procedure Tfrm_Main.FormShow(Sender: TObject);
begin
  Tmr_GotoURL.Enabled:= True;
  edt_URL.SetFocus;
end;
procedure Tfrm_Main.FormResize(Sender: TObject);
begin
  btn_GetNetInfo.Left:= (Width - 190) div 2;
  btn_Close.Left:= btn_GetNetInfo.Left + 130;
end;
procedure Tfrm_Main.edt_URLKeyDown(Sender: TObject; var Key: Word;
  Shift: TShiftState);
begin
  if Key = VK_RETURN then
  begin
    Key:= 0;
    wb_WebContent.Navigate(edt_URL.Text);
  end;
end;
procedure Tfrm_Main.btn_GotoURLClick(Sender: TObject);
begin
  wb_WebContent.Navigate(edt_URL.Text);
end;
procedure Tfrm_Main.btn_GetNetInfoClick(Sender: TObject);
var
  I: Integer;
  J: Integer;
begin
  if lv_Info.Items.Count > 0 then
  begin
    with lv_Info do
    begin
      Items.BeginUpdate;
      Items.Clear;
      Items.EndUpdate;
    end;
  end;
  btn_GetNetInfo.Enabled:= False;
  btn_Close.Enabled:= False;
  try
    if not GetDatasInfo then
      Exit;
    if FTotalPageCount = '1' then // 只 1页,直接保存退出即可
    begin
      SaveInfoToLV;
    end
    else
    begin                        // 多于 1页,先保存第 1 页内容,再...
      SaveInfoToLV;
      for I:= 2 to StrToInt(FTotalPageCount) do
      begin
        if not GotoNthPage(IntToStr(I)) then
        begin
          MessageBox(
                     Handle,
                     '网页跳转失败,程序无法继续抓取数据! ',
                     '错误',
                     MB_OK+ MB_ICONERROR
                    );
          Exit;
        end;
        lbl_Hints.Caption:= '正在获取第 ' + IntToStr(I) + '/' + FTotalPageCount + ' 页数据信息...';
        Update;
        Application.ProcessMessages;
        while wb_WebContent.Busy do
          Application.ProcessMessages;
        ShowMessage('转页...');        // 延时,可以优化...
        for J:= 0 to 20 do
          Application.ProcessMessages;
        ShowMessage('转页毕...');
        Application.ProcessMessages;
        Sleep(200);
        Application.ProcessMessages;
        while True do
        begin
          if SaveInfoToLV then
            Break
          else
          begin
            if not GotoNthPage(IntToStr(I)) then
            begin
              MessageBox(
                         Handle,
                         PChar('网页跳转失败(第 ' + IntToStr(I) + ' 页),程序无法继续抓取数据! '),
                         '错误',
                         MB_OK+ MB_ICONERROR
                        );
              Exit;
            end;
          end;
        end;
        Application.ProcessMessages;
      end;
    end;
  finally
    btn_GetNetInfo.Enabled:= True;
    btn_Close.Enabled:= True;
    lbl_Hints.Caption:= '';
    Update;
  end;
  if dlg_SaveInfo.Execute then
  begin
    if SaveLVToXLS(dlg_SaveInfo.FileName) then
    begin
      if MessageBox(
                    Handle,
                    PChar(
                          '所获取的网页数据信息已成功保存至 ' + dlg_SaveInfo.FileName +
                          ',是否现在查看?'
                         ),
                    '提示',
                    MB_YESNO + MB_ICONQUESTION
                   ) = IDYES then
       begin
         if ShellExecute(0, 'Open', PChar(dlg_SaveInfo.FileName), nil, nil, SW_SHOW) <= 32 then
           MessageBox(Handle, '打开文件失败! ', '提示', MB_OK + MB_ICONINFORMATION);
       end;
    end;
  end;
  with lv_Info do
  begin
    Items.BeginUpdate;
    Items.Clear;
    Items.EndUpdate;
  end;
end;
procedure Tfrm_Main.btn_CloseClick(Sender: TObject);
begin
  Close;
end;
procedure Tfrm_Main.Tmr_GotoURLTimer(Sender: TObject);
begin
  Tmr_GotoURL.Enabled:= False;
  Screen.Cursor:= crHourGlass;
  wb_WebContent.Navigate(edt_URL.Text);
  while wb_WebContent.Busy do
    Application.ProcessMessages;
  Screen.Cursor:= crDefault;
end;
procedure Tfrm_Main.Tmr_ClsDlgTimer(Sender: TObject);
var
  H: THandle;
begin
  H:= FindWindow('TMessageForm', PChar(Application.Title));
  if H > 0 then
  begin
    SendMessage(H, WM_KEYDOWN, VK_SPACE, 0);
    SendMessage(H, WM_CLOSE, 0, 0);
  end;
end;
function Tfrm_Main.GetDatasInfo: Boolean;
var
  ws: string;
begin
  Result:= False;
  ws:= (wb_WebContent.Document as IHTMLDocument2).Body.outerHTML;
  // 总数据条数
  Delete(ws, 1, Pos('<TD>共 ', ws) + 6);
  FTotalCount:= Copy(ws, 1, Pos(' 条信息', ws) - 1);
  if FTotalCount = '0' then
  begin
    MessageBox(Handle, '没有可供获取的网页数据信息! ', '提示', MB_OK + MB_ICONINFORMATION);
    Exit;
  end;

  // 总分页数
  Delete(ws, 1, Pos('页次:', ws) + 7);
  FTotalPageCount:= Copy(ws, 1, Pos('页 20篇', ws) - 1);
  if FTotalPageCount = '0' then
  begin
    MessageBox(Handle, '没有可供获取的网页数据信息!', '提示', MB_OK + MB_ICONINFORMATION);
    Exit;
  end;
  Result:= True;
end;
function Tfrm_Main.GotoNthPage(PageIndex: string): Boolean;
var
  Doc: IHTMLDocument2;
  Form: IHTMLFormElement;
  Elements: IHTMLElementCollection;
  InputElem: IHTMLInputElement;
  I: Integer;
begin
  Result:= False;
  Doc:= wb_WebContent.Document as IHTMLDocument2;
  Elements:= Doc.Forms as IHTMLElementCollection;
  Form:= Elements.Item(0, varEmpty) as IHTMLFormElement;
  Elements:= (Doc.All as IHTMLElementCollection).tags('input') as IHTMLElementCollection;
  for I:= 0 to Elements.Length - 1 do // 找到,并填充页码文本框
  begin
    InputElem:= Elements.Item(I, varEmpty) as IHTMLInputElement;
    if UpperCase(Trim(InputElem.Name)) = 'CPF.CPAGE' then
    begin
      InputElem.Value := PageIndex;
      Break;
    end;
  end;
  for I:= 0 to Elements.length - 1 do // 转到指定的页码
  begin
    InputElem:= Elements.Item(I, varEmpty) as IHTMLInputElement;
    if UpperCase(Trim(InputElem.type_)) = 'SUBMIT' then
    begin
      (InputElem as IHTMLInputButtonElement).Form.Submit;
      Break;
    end;
  end;
  while wb_WebContent.Busy do
    Application.ProcessMessages;
  for I:= 0 to 20 do
    Application.ProcessMessages;
  ShowMessage('提交网页请求...');
  for I:= 0 to 20 do
    Application.ProcessMessages;
  Result:= True;
end;
function  Tfrm_Main.SaveInfoToLV: Boolean;
var
  i: Integer;
  ws: string;
  ws2: string;
  Name: string;
  Product: string;
  Price: string;
  SubMoney: string;
  SubDate: string;
begin
  Result:= False;
  while wb_WebContent.Busy do
    Application.ProcessMessages;
  for I:= 0 to 20 do
    Application.ProcessMessages;
  ws:= (wb_WebContent.Document as IHTMLDocument2).Body.outerHTML;
  ws2:= Copy(ws, 1, Pos('<TD>共 ', ws) - 1);
  if ws2 = '' then // 转页发生异常
  begin
    Exit;
  end;
  ws:= (wb_WebContent.Document as IHTMLDocument2).Body.outerHTML;
  while Pos('<TD>共 ', ws) <> 1  do
  begin
    // 姓名
    Delete(ws, 1, Pos('<TD>', ws) + 3);
    Name:= Copy(ws, 1, Pos('</TD>', ws) - 1);
    // 产品
    Delete(ws, 1, Pos('<TD>', ws) + 3);
    Product:= Copy(ws, 1, Pos('</TD>', ws) - 1);
    // 价格
    Delete(ws, 1, Pos('<TD>', ws) + 3);
    Price:= Copy(ws, 1, Pos('</TD>', ws) - 1);
    // 补贴金额
    Delete(ws, 1, Pos('<TD>', ws) + 3);
    SubMoney:= Copy(ws, 1, Pos('</TD>', ws) - 1);
    // 补贴时间
    Delete(ws, 1, Pos('<TD align=middle>', ws) + 16);
    SubDate:= Copy(ws, 1, Pos('</TD>', ws) - 1);
    
    with lv_Info.Items.Add do
    begin
      Caption:= Name;
      SubItems.Add(Product);
      SubItems.Add(Price);
      SubItems.Add(SubMoney);
      SubItems.Add(SubDate);
    end;
    Delete(ws, 1, Pos('<TD>', ws) - 1);
    if Copy(ws, 1, 6) = '<TD>共' then
      Break;
  end;
  Result:= True;
end;
function Tfrm_Main.SaveLVToXLS(FileName: string): Boolean;
var
  FExcel: Variant;
  FWorkBook: Variant;
  FWorkSheet: Variant;
  FArray: Variant;
  s, z: Integer;
  RangeStr: string[12];
  Lis: TListItem;
  cells: string;
begin
  Result:= False;
  try
    FExcel:= CreateOleObject('Excel.Application');
  except
    Screen.Cursor:= crDefault;
    MessageBox(
               Handle, // or ZERO
               PChar('您的电脑尚未安装 MS Excel,无法保存数据信息! '),
               '错误',
               MB_OK + MB_ICONERROR
              );
    Exit;
  end;
  FWorkBook:= FExcel.WorkBooks.Add;
  FWorkSheet:= FWorkBook.WorkSheets.Add;
  FArray:= VarArrayCreate([0, lv_Info.Items.Count, 0, lv_Info.Columns.Count], varVariant);
  for z:= 0 to lv_Info.Columns.Count - 1 do //暂存 ListView 之列头
  begin
    cells:= lv_Info.Columns[z].Caption;
    FArray[0, z]:= cells;
  end;
  for s:= 1 to lv_Info.Items.Count do       //暂存 ListView 之内容
  begin
    Lis:= lv_Info.Items[s - 1];
    for z:= 0 to lv_Info.Columns.Count - 1 do
    begin
      if z = 0 then
        cells:= Lis.Caption
      else
      begin
        if z - 1 >= Lis.SubItems.Count then
          cells:= ''
        else
          cells:= Lis.SubItems[z - 1];
      end;
      FArray[s, z]:= cells;
    end;
  end;
  RangeStr:= 'A1:';
  if (lv_Info.Columns.Count > 26) then
  begin
    RangeStr:= RangeStr + Chr(Ord('A') - 1 + (lv_Info.Columns.Count div 26));
    RangeStr:= RangeStr + Chr(Ord('A') - 1 + (lv_Info.Columns.Count mod 26));
  end
  else
    RangeStr:= RangeStr + Chr(Ord('A') - 1 + lv_Info.Columns.Count);
  RangeStr:= RangeStr + IntToStr(lv_Info.Items.Count + 1);
  FWorkSheet.Range[RangeStr].Value:= FArray;
  FWorkbook.SaveAs(FileName);
  FExcel.Quit;
  FExcel:= unAssigned;
  Result:= True;
end;
end.
经测试,可正常抓取 万条、数百页 的数据。
附截图:




你可能感兴趣的:(职场,休闲)