爬虫遍历中图分类法

很多图书馆网站都提供较为完整的网页版《中图分类法》，不同的分类目录分别位于不同的链接页面上。

为此，我专门写了一个遍历这类网站的 Java 程序，下面以“中国图书馆分类法网站”（http://www.ztflh.com）为例。


数据库建表脚本（SQL Server）：

-- Table filled by the crawler's "insert into zhtclass values(?,?)" statement.
--   type   : first insert parameter  (the classification code text)
--   detail : second insert parameter (the classification name text)
-- Both columns are nullable and no key is declared.
CREATE TABLE [zhtclass]
(
    [type]   nvarchar(100) COLLATE Chinese_PRC_CI_AS NULL,
    [detail] nvarchar(400) COLLATE Chinese_PRC_CI_AS NULL
)
ON [PRIMARY]
GO

 


Java代码:

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

/**
 * Crawls the category pages {@code http://www.ztflh.com/?c=<id>} one by one,
 * extracts (code, name) pairs from each page by plain substring search between
 * configurable markers, and inserts them into the {@code zhtclass} table.
 *
 * <p>Each page is fetched and parsed completely before anything is written,
 * and the rows of one page are committed together, so a retried page does not
 * leave duplicate rows behind. A page that keeps failing is skipped after
 * {@link #MAX_ATTEMPTS} attempts instead of being retried forever.
 */
public class Exec {
    /** Text found on a page that has no sub-categories; such pages yield no rows. */
    private static final String NULL_FLAG = "没有下级分类";

    // The five markers below are meant to be adjusted to the page layout of the
    // site being crawled.
    //
    // NOTE(review): in the published listing several of these literals are
    // empty or broken across lines, which looks like the HTML tags they
    // contained were lost when the article was rendered. Only the characters
    // that are still visible are kept here; the real markers must be restored
    // from the page source before the crawler can work. fetchPage() joins the
    // lines of a page without separators, so a marker containing a line break
    // can never match.

    /** Marker after which the list of entries starts. TODO confirm value. */
    private static final String FIRST_FLAG_ALPH = "\n\n  • ";
    /** Marker immediately before an entry's code. TODO restore (currently empty). */
    private static final String START_FLAG_ALPH = "";
    /** Marker immediately after an entry's code. TODO confirm value. */
    private static final String END_FLAG_ALPH = "\n    ";
    /** Marker immediately before an entry's name. */
    private static final String START_FLAG_NAME = "\">";
    /** Marker immediately after an entry's name. TODO restore (currently empty). */
    private static final String END_FLAG_NAME = "";

    /** First page id requested. */
    private static final int FIRST_ID = 1;
    /** Page ids are requested up to, but not including, this value. */
    private static final int END_ID = 45837;
    /** How many times one page is attempted before it is skipped. */
    private static final int MAX_ATTEMPTS = 3;
    /** Connect and read timeout for each page request, in milliseconds. */
    private static final int TIMEOUT_MS = 10000;

    /**
     * Program entry point: connects to the database and crawls every page id
     * in {@code [FIRST_ID, END_ID)}.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Refuse to start with unusable markers: with an empty marker the
        // parser could not tell where a field begins or ends.
        try {
            requireEntryMarkers(START_FLAG_ALPH, END_FLAG_ALPH,
                    START_FLAG_NAME, END_FLAG_NAME);
        } catch (IllegalArgumentException e) {
            System.out.println(e.getMessage());
            return;
        }

        try {
            Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
        } catch (ClassNotFoundException e) {
            System.out.println("JDBC driver not found: " + e);
            return;
        }

        // NOTE(review): the connection uses the "sa" login with an empty
        // password, exactly as in the original listing.
        try (Connection con = DriverManager.getConnection(
                     "jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=LIB",
                     "sa", "");
             PreparedStatement pstm =
                     con.prepareStatement("insert into zhtclass values(?,?)")) {
            // One transaction per page, see storeEntries().
            con.setAutoCommit(false);
            for (int i = FIRST_ID; i < END_ID; i++) {
                System.out.println(i);
                crawlPage(con, pstm, i);
            }
        } catch (SQLException e) {
            // Without a working connection there is nothing useful left to do.
            System.out.println("Database error: " + e);
        }
    }

    /** Fetches, parses and stores one page, retrying a bounded number of times. */
    private static void crawlPage(Connection con, PreparedStatement pstm, int id) {
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                List<String[]> entries = parseEntries(fetchPage(id), NULL_FLAG,
                        FIRST_FLAG_ALPH, START_FLAG_ALPH, END_FLAG_ALPH,
                        START_FLAG_NAME, END_FLAG_NAME);
                storeEntries(con, pstm, entries);
                return;
            } catch (IOException | SQLException e) {
                System.out.println("c=" + id + " attempt " + attempt + " of "
                        + MAX_ATTEMPTS + " failed: " + e);
            }
        }
        System.out.println("c=" + id + " skipped after " + MAX_ATTEMPTS + " attempts");
    }

    /** Downloads page {@code id} as UTF-8 text with all lines joined without separators. */
    private static String fetchPage(int id) throws IOException {
        URLConnection urlConn = new URL("http://www.ztflh.com/?c=" + id).openConnection();
        urlConn.setReadTimeout(TIMEOUT_MS);
        urlConn.setConnectTimeout(TIMEOUT_MS);

        StringBuilder page = new StringBuilder();
        try (Scanner in = new Scanner(urlConn.getInputStream(), "UTF-8")) {
            while (in.hasNextLine()) {
                page.append(in.nextLine());
            }
            // Scanner hides read errors; surface them so a truncated page is
            // retried instead of being parsed as if it were complete.
            IOException failure = in.ioException();
            if (failure != null) {
                throw failure;
            }
        }
        return page.toString();
    }

    /**
     * Extracts all (code, name) pairs from a page.
     *
     * <p>Kept package-private and free of I/O so it can be exercised directly.
     *
     * @param page      page text to scan
     * @param nullFlag  if this text occurs anywhere in the page, no entries are returned
     * @param firstFlag marker after which the entry list starts; if it is absent,
     *                  no entries are returned
     * @param startAlph marker immediately before an entry's code
     * @param endAlph   marker immediately after an entry's code
     * @param startName marker immediately before an entry's name
     * @param endName   marker immediately after an entry's name
     * @return the entries in page order, each as {@code {code, name}} with
     *         surrounding whitespace trimmed; an incomplete trailing entry is dropped
     * @throws IllegalArgumentException if any of the four entry markers is null or empty
     */
    static List<String[]> parseEntries(String page, String nullFlag, String firstFlag,
            String startAlph, String endAlph, String startName, String endName) {
        requireEntryMarkers(startAlph, endAlph, startName, endName);

        List<String[]> entries = new ArrayList<String[]>();
        if (page.indexOf(nullFlag) >= 0) {
            return entries;
        }
        int listStart = page.indexOf(firstFlag);
        if (listStart < 0) {
            return entries;
        }

        int pos = listStart + firstFlag.length();
        while (true) {
            int alphStart = page.indexOf(startAlph, pos);
            if (alphStart < 0) {
                break;
            }
            alphStart += startAlph.length();
            int alphEnd = page.indexOf(endAlph, alphStart);
            if (alphEnd < 0) {
                break;
            }
            int nameStart = page.indexOf(startName, alphEnd);
            if (nameStart < 0) {
                break;
            }
            nameStart += startName.length();
            int nameEnd = page.indexOf(endName, nameStart);
            if (nameEnd < 0) {
                break;
            }
            entries.add(new String[] {
                page.substring(alphStart, alphEnd).trim(),
                page.substring(nameStart, nameEnd).trim()
            });
            // Non-empty markers guarantee nameEnd > pos, so the scan advances.
            pos = nameEnd;
        }
        return entries;
    }

    /** Inserts all entries of one page and commits them together; rolls back on failure. */
    private static void storeEntries(Connection con, PreparedStatement pstm,
            List<String[]> entries) throws SQLException {
        try {
            for (String[] entry : entries) {
                pstm.setString(1, entry[0]);
                pstm.setString(2, entry[1]);
                pstm.executeUpdate();
            }
            con.commit();
        } catch (SQLException e) {
            try {
                con.rollback();
            } catch (SQLException rollbackFailure) {
                e.addSuppressed(rollbackFailure);
            }
            throw e;
        }
    }

    /** Rejects null or empty entry markers. */
    private static void requireEntryMarkers(String startAlph, String endAlph,
            String startName, String endName) {
        requireNonEmpty("startFlagAlph", startAlph);
        requireNonEmpty("endFlagAlph", endAlph);
        requireNonEmpty("startFlagName", startName);
        requireNonEmpty("endFlagName", endName);
    }

    /** Throws {@link IllegalArgumentException} naming the marker if it is null or empty. */
    private static void requireNonEmpty(String name, String marker) {
        if (marker == null || marker.isEmpty()) {
            throw new IllegalArgumentException("Marker " + name
                    + " is empty; set it to the text surrounding the field in the page source");
        }
    }
}

     

你可能感兴趣的:(图情研究,JavaEE,数据库,Java通用范例开发金典)