for(int i=0;i<205;i++) { String sql="select * from cat_keyword order by id limit "+i*10001+",10000"; String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000"; rs=stmt.executeQuery(best); go.filt(rs); }
经试验,第一条sql语句的执行效率是明显不如第二条的。
String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";
这条SQL语句尤其适用于offset特别大的情景,因为200多万条记录,到中后期,这个offset是很大的,使用未经优化的SQL语句,到后期是越来越慢。
另外limit的取值也很有考究,我试过1000,10000,70000,100000的,最后发现limit取10000的时候速度是最快的,这个跟数据的大小,电脑以及数据库内存的分配都有一定的关系,大家可以根据具体情况具体讨论。我这里仅仅提一下需要注意,可以优化的point。
我真正想说的是我想的另外一种解决方案,运用JDBC
下面给大家说下我这个程序的构造,首先是一个实现将需要删除和修改的词插入另一张表的方法
//将找到的违规词插入cat_garbage表里 public void insert(ResultSet rs) throws SQLException { Connection conn=null; Statement stmt=null; try{ conn=Good.getConnection(); stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE , ResultSet.CONCUR_UPDATABLE); String query="select * from cat_garbage where id='"+rs.getInt("id")+"'"; ResultSet rst=stmt.executeQuery(query); if(!rst.next()) { String sql1="insert into cat_garbage(id,cid,name,keyword) values('"+rs.getInt("id")+"','"+rs.getInt("cid")+"','"+rs.getString("name")+"','"+rs.getString("keyword")+"')"; stmt.executeUpdate(sql1); } } catch(SQLException|ClassNotFoundException|IOException e) { e.printStackTrace(); } finally { if(stmt!= null) stmt.close(); if(conn!= null) conn.close(); } }然后是一个将可以修复的词修复的方法
public void modify(ResultSet rs,String str,String reg) throws SQLException { Connection conn=null; Statement stmt=null; Statement stmt1=null; try{ conn=Good.getConnection(); stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE , ResultSet.CONCUR_UPDATABLE); stmt1= conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE , ResultSet.CONCUR_UPDATABLE); String ok=null; if(str.indexOf(reg)==str.length()-1) { ok=str.substring(0,str.indexOf(reg)); System.out.println(ok); System.out.println(ok); System.out.println(1); } else if(str.indexOf(reg)==0) { ok=str.substring(1); while(ok.indexOf(reg)!=-1) ok=ok.substring(1); System.out.println(2); } else { System.out.println(3); System.out.println(str); ok=str.substring(0,str.indexOf(reg)); } System.out.println(ok); new Good().insert(rs); //String query="select * from cat_keyword where cid='"+rs.getInt("cid")+"'and keyword='"+ok+"'"; String sql="update cat_garbage1 set new='"+ok+"' where id='"+rs.getInt(1)+"'"; //String sql1="update cat_keyword set keyword='"+ok+"' where id='"+rs.getInt("id")+"'"; stmt.executeUpdate(sql); } catch(SQLException|ClassNotFoundException|IOException e) { e.printStackTrace(); } finally { if(stmt!= null) stmt.close(); if(stmt1!= null) stmt1.close(); if(conn!= null) conn.close(); } }
public void filt(ResultSet rs) { try { String str; double a=0;double b=0;double c=0;double d=0; int num=0; Good good=new Good(); while(rs.next()) { str=rs.getString("keyword"); if(str.equals(",")|str.equals(".")|str.equals(":")|str.equals("-")|str.equals("=")|str.equals("+")|str.equals("*")|str.equals("/")|str.equals("?")|str.equals("|")|str.equals("\\")|str.equals("%")|str.equals("#")|str.equals("!")||str.equals("~")||str.equals("、")) { good.insert(rs); a++; } //字符串后面带逗号 if(rule(str,"^\\S+[,]$").matches()) { good.modify(rs,str,","); num++; } //字符串后面带句号 if(rule(str,"^\\S+[.]$").matches()) { good.modify(rs,str,"."); num++; } //字符串前面带句号 if(rule(str,"^[.]\\S+$").matches()) { System.out.println(123); if(str.indexOf(".net")==-1) { if(str.indexOf(".com")==-1) { if(str.indexOf(".cn")==-1) { if(str.indexOf(".org")==-1) { good.modify(rs, str, "."); num++; } } } } } //字符串后面带- if(rule(str,"^\\S+[-]$").matches()) { good.modify(rs, str, "-"); num++; } //字符串前面带- if(rule(str,"^[-]\\S+$").matches()) { good.modify(rs, str, "-"); num++; } //字符串前面带冒号 if(rule(str,"^[:]\\S+$").matches()) { good.modify(rs, str, ":"); num++; } //字符串后面带冒号 if(rule(str,"^\\S+[:]$").matches()) { good.modify(rs, str, ":"); num++; } //字符串后面带/ if(rule(str,"^\\S+[/]$").matches()) { good.modify(rs, str, "/"); num++; } //字符串前面带/ if(rule(str,"^[/]\\S+$").matches()) { good.modify(rs, str, "/"); num++; } //字符串后面带? if(rule(str,"^\\S+[?]$").matches()) { good.modify(rs, str, "?"); num++; } //字符串前面带? if(rule(str,"^[?]\\S+$").matches()) { good.modify(rs, str, "?"); num++; } //字符串前面带| if(rule(str,"^[|]\\S+$").matches()) { good.modify(rs, str, "|"); num++; } //字符串后面带| if(rule(str,"^\\S+[|]$").matches()) { good.modify(rs, str, "|"); num++; } //字符串前面带( if(rule(str,"^[(][\\S\\s]+$").matches()) { if(rule(str,"^[(]\\S+[)]\\S*$").matches()) { str=str.replaceAll("\\)", " "); good.modify(rs, str, "("); num++; } else { good.modify(rs, str, "("); num++; } } //字符串前面带[ if(rule(str,"^[\\[][\\S\\s]+$").matches()) { if(rule(str,"^[\\[]\\S+[\\]]\\S*$").matches()) { str=str.replaceAll("\\]", " "); good.modify(rs, str, "["); num++; } else { good.modify(rs, str, "["); num++; } } //字符串后面带)或者里面带() if(rule(str,"^\\S+\\s*\\S+[)]$").matches()) { System.out.println("qwe"); if(rule(str,"^\\S+[(]\\S+[)]$").matches()) { str=str.replaceAll("\\(", " "); good.modify(rs, str, ")"); num++; } else if(rule(str,"^\\S+\\s*[(]\\S+[)]$").matches()) { str=str.replaceAll("\\(", ""); good.modify(rs, str, ")"); num++; } else { good.modify(rs, str, ")"); num++; } } //字符串后面带]或者里面带[] if(rule(str,"^\\S+\\s*\\S+[\\]]$").matches()) { System.out.println("qwe"); if(rule(str,"^\\S+[\\[]\\S+[\\]]$").matches()) { str=str.replaceAll("\\[", " "); good.modify(rs, str, "]"); num++; } else if(rule(str,"^\\S+\\s*[\\[]\\S+[\\]]$").matches()) { str=str.replaceAll("\\[", ""); good.modify(rs, str, "]"); num++; } else { good.modify(rs, str, "]"); num++; } } //字符串内包含书名号 if(rule(str,"^[\\s\\S]*[《][\\s\\S]+[》]").matches()) { str=str.replaceAll("》", " "); good.modify(rs, str, "《"); num++; } //字符串内包含中文() if(rule(str,"^[\\s\\S]*[(][\\s\\S]+[)]").matches()) { str=str.replaceAll("(", " "); good.modify(rs, str, ")"); num++; } //纯数字 if(rule(str,"^[1-9]*|0$").matches()) { good.insert(rs); b++; } //单字母 if(rule(str,"^[a-zA-Z]$").matches()) { int flag=0; String shit=rs.getString(3); for(int i=0;i<shit.length();i++) { if(str.charAt(0)!=shit.charAt(i)) flag++; if(flag==shit.length()) { good.insert(rs); c++; } } } //单汉字 if(rule(str,"^[\u4e00-\u9fa5]$").matches()) { int flag=0; String shit=rs.getString(3); for(int i=0;i<shit.length();i++) { if(str.charAt(0)!=shit.charAt(i)) flag++; if(flag==shit.length()) { good.insert(rs); d++; } } } } System.out.println("单标点百分比:"+100*a/2046036+"%"); System.out.println("纯数字百分比:"+100*b/2046036+"%"); System.out.println("单字母百分比:"+100*c/2046036+"%"); System.out.println("单汉字百分比:"+100*d/2046036+"%"); System.out.println(num); } catch(SQLException e) { e.printStackTrace(); } }下面是程序的主方法
public static void main(String[] args) throws SQLException { Connection conn=null; Statement stmt=null; try{ long a=System.currentTimeMillis(); ResultSet rs; conn=Good.getConnection(); stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE , ResultSet.CONCUR_UPDATABLE); Good go=new Good(); /*for(int i=0;i<205;i++) { //String sql="select * from cat_keyword order by id limit "+i*10001+",10000"; String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000"; rs=stmt.executeQuery(best); go.filt(rs); }*/ String sql="select * from cat_keyword"; rs=stmt.executeQuery(sql); go.filt(rs); long b=System.currentTimeMillis(); System.out.println("耗时: "+(b-a)/60000+"分钟"); } catch(ClassNotFoundException|SQLException|IOException e) { e.printStackTrace(); } finally { if(stmt!= null) stmt.close(); if(conn!= null) conn.close(); } }用一个SQL语句,将整个表的数据一次性作为参数传给负责取词过滤的filt()方法,然后利用ResultSet rs这个指针依次向下对每个词进行处理, 这个方案运行下来处理完全表204万记录只需11分钟,而是用优化之后的limit语句获得的结果集再传给filt()方法那个方案,却需要26分钟。可见使用JDBC,直接传入所有数据的结果集,比使用limit多次限量导入数据要快的多,而且不止是一倍的关系。前者11分钟完胜后者的26分钟。