爬虫(Java实现)

//Robot.java
1
package robot; 2 import java.net.*; 3 import java.sql.SQLException; 4 import java.util.Random; 5 6 import javax.swing.JOptionPane; 7 8 import org.htmlparser.*; 9 import org.htmlparser.filters.TagNameFilter; 10 import org.htmlparser.tags.LinkTag; 11 import org.htmlparser.util.NodeList; 12 import org.htmlparser.util.ParserException; 13 14 import mydb.DB; 15 public class Robot { 16 int ff=0; 17 18 int num=0; 19 //DB db; 20 Robot() throws MalformedURLException, SQLException{ 21 DB.getConnect("localhost","3306","robot","root","142365"); 22 DB.getSta(); 23 String url0="http://www.youku.com";//"http://localhost"; 24 /* DB.rs= DB.s.executeQuery("select count(*) from urls"); 25 if(DB.rs.next()) 26 {int n=DB.rs.getInt(1); 27 System.out.println(n ); 28 Random random = new Random(); 29 30 int ran = random.nextInt(); 31 ran%=n; 32 ran=ran>0?ran:-ran; 33 System.out.println(ran ); 34 DB.rs= DB.s.executeQuery("select * from urls"); 35 int x=0; 36 while(DB.rs.next()&&x<ran){ 37 38 System.out.println(DB.rs.getString(1)+"000" ); url0=DB.rs.getString(1); 39 40 x++; 41 } 42 43 }*/ 44 //catchHref("http://localhost",num); 45 catchHref(url0,num); 46 } 47 boolean isEndLegal(String str){ 48 if(str.endsWith("php")||str.endsWith("net/")||str.endsWith("com/")||str.endsWith("cn/")||str.endsWith("gov/")||str.endsWith("edu/")||str.endsWith("org/")||str.endsWith("net")||str.endsWith("com")||str.endsWith("cn")||str.endsWith("gov")||str.endsWith("edu")||str.endsWith("org")){ 49 return true; 50 } 51 return false; 52 } 53 boolean catchHref(String hreft ,int num) throws MalformedURLException { 54 Parser parser =null; 55 NodeList nodelist=null; 56 String href = "http://www.baidu.com"; 57 //db=new DB(); 58 if(ff!=0) 59 if (!(hreft.startsWith("http")&&isEndLegal(hreft)&&!isInDatabase(hreft))) { 60 return false; 61 } 62 ff=1; 63 add(hreft); 64 65 System.out.println(num); 66 try { 67 parser = new Parser(hreft); 68 if(parser==null)return false; 69 } catch (ParserException e) { 70 return false; 71 
//e.printStackTrace(); 72 } 73 try { 74 nodelist = parser.parse(null); 75 } catch (ParserException e1) { 76 e1.printStackTrace(); 77 } 78 if(nodelist==null)return false; 79 NodeFilter filter = new TagNameFilter("A"); 80 if(filter==null)return false; 81 nodelist = nodelist.extractAllNodesThatMatch(filter, true); 82 if(nodelist==null)return false; 83 for (int i = 0; i < nodelist.size(); i++) { 84 LinkTag link = (LinkTag) nodelist.elementAt(i); 85 href = link.getAttribute("href"); 86 if(href==null)return false; 87 System.out.println(href ); 88 catchHref(href,num); 89 90 } 91 num++; 92 return true; 93 } 94 void add(String str){ 95 96 try { 97 DB.s.execute("insert into urls2(url)values('"+str+"');"); 98 DB.commit(); 99 System.out.println("add"); 100 } catch (SQLException e) { 101 //e.printStackTrace();return ; 102 //JOptionPane.showMessageDialog(null, "数据库添加失败"); 103 //System.exit(-1); 104 105 } 106 107 return ; 108 } 109 boolean isInDatabase(String str){ 110 111 try { 112 DB.rs= DB.s.executeQuery("select * from urls where url like'"+str+"%';"); 113 if(DB.rs.next()){System.out.println(DB.rs);return true;} 114 } catch (SQLException e) { 115 e.printStackTrace(); 116 JOptionPane.showMessageDialog(null, "数据库查找失败"); 117 System.exit(-1); 118 } 119 return false; 120 } 121 public static void main(String[] args) 122 throws MalformedURLException, ParserException, SQLException { 123 Robot robot = new Robot(); 124 } 125 }

 

//DB.java
package mydb;

import java.sql.*;
import java.util.ArrayList;

import javax.swing.*;
//import com.mysql.jdbc.Driver;

/**
 * Static holder for a single MySQL connection together with one shared
 * statement and result set.
 *
 * <p>All state lives in public static fields; the methods report failures
 * through a dialog and, where the original did so, terminate the program.
 */
public class DB {

    /** Connection opened by {@link #getConnect}; {@code null} before that. */
    public static Connection conn = null;

    /** Result set slot for callers; this class only closes it in {@link #closeConnect()}. */
    public static ResultSet rs = null;

    /** Statement created by {@link #getSta()}; {@code null} before that. */
    public static Statement s = null;

    /**
     * Resets the shared static fields to {@code null}.
     *
     * <p>NOTE(review): this does not close anything, so creating an instance
     * after {@link #getConnect} drops the only references to an open
     * connection. Kept as is for compatibility.
     */
    public DB() {
        conn = null;
        s = null;
        rs = null;
    }

    /**
     * Loads the MySQL driver, opens a connection and turns auto-commit off.
     *
     * <p>When the driver class is missing or the connection cannot be opened,
     * a dialog is shown and the program exits with status -1.
     *
     * @param IP       database host
     * @param port     database port
     * @param database schema name
     * @param user     login name
     * @param password login password
     * @return the opened connection, also stored in {@link #conn}
     */
    public static Connection getConnect(String IP, String port, String database,
            String user, String password) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
        } catch (ClassNotFoundException e1) {
            e1.printStackTrace();
            JOptionPane.showMessageDialog(null, "数据库包未找到");
            System.exit(-1);
        }

        try {
            conn = DriverManager.getConnection(
                    "jdbc:mysql://" + IP + ":" + port + "/" + database
                            + "?useUnicode=true&characterEncoding=utf8",
                    user, password);
        } catch (SQLException e1) {
            e1.printStackTrace();
            JOptionPane.showMessageDialog(null, "数据库无法连接");
            System.exit(-1);
        }

        try {
            // Changes are applied only when commit() is called.
            conn.setAutoCommit(false);
        } catch (SQLException e1) {
            e1.printStackTrace();
        }

        return conn;
    }

    /**
     * Creates the shared statement on the current connection.
     *
     * <p>On failure a dialog is shown and the program exits with status -1.
     *
     * @return the new statement, also stored in {@link #s}
     */
    public static Statement getSta() {
        try {
            s = conn.createStatement();
        } catch (SQLException e1) {
            e1.printStackTrace();
            JOptionPane.showMessageDialog(null, "无法建立数据库语句");
            System.exit(-1);
        }
        return s;
    }

    /**
     * Commits the current transaction.
     *
     * @return 0 when the commit succeeded, -1 when it failed (a dialog is
     *         shown in that case)
     */
    public static int commit() {
        try {
            conn.commit();
        } catch (SQLException e) {
            e.printStackTrace();
            JOptionPane.showMessageDialog(null, "对数据库更改无法应用");
            return -1;
        }
        return 0;
    }

    /**
     * Closes the result set, the statement and the connection, in that order.
     *
     * <p>Fields that were never assigned are skipped, and a failure to close
     * one resource does not prevent closing the others.
     */
    public static void closeConnect() {
        if (rs != null) {
            try {
                rs.close();
            } catch (SQLException e) {
                e.printStackTrace();
                JOptionPane.showMessageDialog(null, "数据库结果集无法关闭");
            }
        }

        if (s != null) {
            try {
                s.close();
            } catch (SQLException e) {
                e.printStackTrace();
                JOptionPane.showMessageDialog(null, "数据库语句无法关闭");
            }
        }

        if (conn != null) {
            try {
                conn.close();
            } catch (SQLException e) {
                e.printStackTrace();
                JOptionPane.showMessageDialog(null, "与数据库的连接无法关闭");
            }
        }
    }
}

 

你可能感兴趣的:(爬虫(Java实现))