爬虫遍历中图分类法

很多图书馆网站都提供较为完整的网页版《中图分类法》,不同的分类目录分别位于不同的链接页面上。

为此,我专门写了一个遍历这类网站的 Java 程序,下面以“中国图书馆分类法网站”(www.ztflh.com)为例。


数据库建表脚本(SQL Server):

-- Target table for the crawler below. Its insert is positional
-- ("insert into zhtclass values(?,?)"), so the column order matters:
-- the first column receives the classification code, the second the name.
CREATE TABLE [zhtclass]
(
       [type]   nvarchar(100) COLLATE Chinese_PRC_CI_AS NULL,  -- classification code
       [detail] nvarchar(400) COLLATE Chinese_PRC_CI_AS NULL   -- category name
)
ON [PRIMARY]
GO

 


Java代码:

import java.io.IOException;
import java.net.URL;
import java.net.URLConnection;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Scanner;

/**
 * Crawls the category pages {@code http://www.ztflh.com/?c=<id>} and stores every
 * (classification code, category name) pair found on them in the {@code zhtclass} table.
 *
 * <p>Pages are visited sequentially. A page that cannot be downloaded is retried a bounded
 * number of times and then skipped, so a single bad page cannot stall the crawl. A database
 * failure aborts the run, because every later insert would fail the same way.
 */
public class Exec {
        // The six markers below describe the page layout; adjust them to match the target site.

        /** Text present on a page that has no sub-categories; such pages yield no rows. */
        private static final String NULL_FLAG = "没有下级分类";
        /** Opening markup of the category list; scanning starts right after it. */
        private static final String FIRST_FLAG_ALPH = "<ul id=\"list\" class=\"cent\" style=\"list-style:none;\"><li>";
        /** Markup immediately before a classification code. */
        private static final String START_FLAG_ALPH = "<span class=\"code\">";
        /** Markup immediately after a classification code. */
        private static final String END_FLAG_ALPH = "</span>";
        /** Markup immediately before a category name (the end of the opening anchor tag). */
        private static final String START_FLAG_NAME = "\">";
        /** Markup immediately after a category name. */
        private static final String END_FLAG_NAME = "</a>";

        private static final String PAGE_URL_PREFIX = "http://www.ztflh.com/?c=";
        private static final int FIRST_PAGE_ID = 1;
        private static final int LAST_PAGE_ID = 45836;
        private static final int TIMEOUT_MILLIS = 10000;
        /** How many times one page is requested before it is given up on. */
        private static final int MAX_ATTEMPTS = 3;

        /**
         * Opens the database connection, runs the crawl and releases the JDBC resources.
         *
         * @param args unused
         */
        public static void main(String args[]) {
                Connection con = null;
                PreparedStatement pstm = null;
                try {
                        Class.forName("com.microsoft.jdbc.sqlserver.SQLServerDriver");
                        con = DriverManager
                                        .getConnection(
                                                        "jdbc:microsoft:sqlserver://localhost:1433;DatabaseName=LIB",
                                                        "sa", "");
                        pstm = con.prepareStatement("insert into zhtclass values(?,?)");
                        crawl(pstm);
                } catch (ClassNotFoundException e) {
                        // Without the driver nothing can be stored, so do not start crawling.
                        System.err.println("JDBC driver not found, aborting: " + e);
                } catch (SQLException e) {
                        System.err.println("Database error, aborting: " + e);
                } finally {
                        // Close in reverse order of creation; a failed close is only reported.
                        if (pstm != null) {
                                try {
                                        pstm.close();
                                } catch (SQLException e) {
                                        System.err.println("Could not close statement: " + e);
                                }
                        }
                        if (con != null) {
                                try {
                                        con.close();
                                } catch (SQLException e) {
                                        System.err.println("Could not close connection: " + e);
                                }
                        }
                }
        }

        /**
         * Visits every page id in order and inserts the entries parsed from each page.
         *
         * @param pstm prepared two-parameter insert (code, name)
         * @throws SQLException if an insert fails
         */
        private static void crawl(PreparedStatement pstm) throws SQLException {
                for (int pageId = FIRST_PAGE_ID; pageId <= LAST_PAGE_ID; pageId++) {
                        System.out.println(pageId);
                        String html = fetchWithRetry(pageId);
                        if (html == null) {
                                continue;
                        }
                        // The page is parsed completely before the first insert, so a page is
                        // never retried after some of its rows have already been written.
                        for (String[] entry : parseEntries(html)) {
                                pstm.setString(1, entry[0]);
                                pstm.setString(2, entry[1]);
                                pstm.execute();
                        }
                }
        }

        /**
         * Downloads one page, retrying up to {@link #MAX_ATTEMPTS} times.
         *
         * @param pageId value of the {@code c} query parameter
         * @return the page content, or {@code null} if every attempt failed
         */
        private static String fetchWithRetry(int pageId) {
                for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
                        try {
                                return fetchPage(pageId);
                        } catch (IOException e) {
                                System.err.println("Page " + pageId + ", attempt " + attempt + " of "
                                                + MAX_ATTEMPTS + " failed: " + e);
                        }
                }
                System.err.println("Skipping page " + pageId);
                return null;
        }

        /**
         * Downloads one page as UTF-8 text with the line terminators removed.
         *
         * @param pageId value of the {@code c} query parameter
         * @return the page content joined into a single line
         * @throws IOException if the page cannot be opened or read completely
         */
        private static String fetchPage(int pageId) throws IOException {
                URL url = new URL(PAGE_URL_PREFIX + pageId);
                URLConnection urlConn = url.openConnection();
                urlConn.setReadTimeout(TIMEOUT_MILLIS);
                urlConn.setConnectTimeout(TIMEOUT_MILLIS);
                urlConn.connect();
                Scanner in = new Scanner(urlConn.getInputStream(), "UTF-8");
                try {
                        StringBuilder sb = new StringBuilder();
                        while (in.hasNextLine()) {
                                sb.append(in.nextLine());
                        }
                        // Scanner hides read errors and just reports "no more lines"; surface
                        // them so that a truncated page is retried instead of parsed.
                        IOException readError = in.ioException();
                        if (readError != null) {
                                throw readError;
                        }
                        return sb.toString();
                } finally {
                        in.close();
                }
        }

        /**
         * Extracts the (code, name) pairs from a page.
         *
         * @param html page content
         * @return one two-element array {code, name} per entry, both trimmed; empty if the
         *         page reports no sub-categories or contains no complete entry
         */
        private static List<String[]> parseEntries(String html) {
                List<String[]> entries = new ArrayList<String[]>();
                if (html.indexOf(NULL_FLAG) >= 0) {
                        return entries;
                }
                int listStart = html.indexOf(FIRST_FLAG_ALPH);
                // If the list container is missing, scan the whole page for entries.
                int pos = listStart < 0 ? 0 : listStart + FIRST_FLAG_ALPH.length();
                while (true) {
                        int codeStart = html.indexOf(START_FLAG_ALPH, pos);
                        if (codeStart < 0) {
                                break;
                        }
                        codeStart += START_FLAG_ALPH.length();
                        int codeEnd = html.indexOf(END_FLAG_ALPH, codeStart);
                        if (codeEnd < 0) {
                                break;
                        }
                        int nameStart = html.indexOf(START_FLAG_NAME, codeEnd);
                        if (nameStart < 0) {
                                break;
                        }
                        nameStart += START_FLAG_NAME.length();
                        int nameEnd = html.indexOf(END_FLAG_NAME, nameStart);
                        if (nameEnd < 0) {
                                // Incomplete trailing entry: stop rather than store garbage.
                                break;
                        }
                        entries.add(new String[] {
                                        html.substring(codeStart, codeEnd).trim(),
                                        html.substring(nameStart, nameEnd).trim() });
                        pos = nameEnd;
                }
                return entries;
        }
}

 

你可能感兴趣的:(爬虫,中图分类法)