目录结构
–TranSpider.java (用于主要的爬取相关
–TranBean.java (爬取之后的存储结构
–Test.java (用于测试
package com.lilutong.trans;
import java.sql.Timestamp;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
public class TransBean {
String word=null;
public Timestamp timestamp=null;
List trans=null;
public TransBean() {
word="";
String nowTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date());//将时间格式转换成符合Timestamp要求的格式.
timestamp=Timestamp.valueOf(nowTime);
trans=new ArrayList();
}
public void addTrans(String tran) {
this.trans.add(tran);
}
//以下为自动生成的get+set+tostring
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public List getTrans() {
return trans;
}
public void setTrans(List trans) {
this.trans = trans;
}
public void addTrans(String tran) {
this.trans.add(tran);
}
public Timestamp getTimestamp() {
return timestamp;
}
public void setTimestamp(Timestamp timestamp) {
this.timestamp = timestamp;
}
@Override
public String toString() {
return "TransBean [word=" + word + ", timestamp=" + timestamp + ", trans=" + trans + "]";
}
}
package com.lilutong.trans;
import java.io.*;
import java.net.*;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicHeader;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
import org.apache.http.util.EntityUtils;
import org.apache.jasper.tagplugins.jstl.core.Url;
import javafx.scene.chart.PieChart.Data;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
/**
* java实现爬虫
*/
public class TranSpider {
URL targetUrl=null;
public TranSpider() throws MalformedURLException{
targetUrl=new URL("http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule&sessionFrom=");
}
//获取即将发送的message
public List getMessage(String word) throws Exception {
//获取复杂加密结构------时间戳结构
long timestamp=(new Date().getTime());
int random=(int)(Math.random()*10);
String salt=String.valueOf(timestamp+random);
//获取复杂加密结构----sign
String u = "fanyideskweb";
String d = word;
String f = salt;
String c = "rY0D^0\'nM0}g5Mm1z%1G4";
String info=(u+d+f+c);
byte[] infobyte=info.getBytes();
MessageDigest messageDigest=MessageDigest.getInstance("MD5");
messageDigest.digest(info.getBytes("utf-8"));
// 使用指定的字节更新摘要
messageDigest.update(infobyte);
// 获得密文
byte[] md = messageDigest.digest();
// 把密文转换成十六进制的字符串形式
char hexDigits[] = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
int j = md.length;
char endSign[] = new char[j * 2];
int k = 0;
for (int i = 0; i < j; i++) {
byte byte0 = md[i];
endSign[k++] = hexDigits[byte0 >>> 4 & 0xf];
endSign[k++] = hexDigits[byte0 & 0xf];
}
//获取16进制 md5加密 的sign
String sign=new String(endSign).toLowerCase();
List nameValuePairs = new ArrayList();
nameValuePairs.add(new BasicNameValuePair("i",word));
nameValuePairs.add(new BasicNameValuePair("from","AUTO"));
nameValuePairs.add(new BasicNameValuePair("to","AUTO"));
nameValuePairs.add(new BasicNameValuePair("smartresult","dict"));
nameValuePairs.add(new BasicNameValuePair("client","fanyideskweb"));
nameValuePairs.add(new BasicNameValuePair("salt",salt));
nameValuePairs.add(new BasicNameValuePair("sign",sign));
nameValuePairs.add(new BasicNameValuePair("doctype","json"));
nameValuePairs.add(new BasicNameValuePair("version","2.1"));
nameValuePairs.add(new BasicNameValuePair("keyfrom","fanyi.web"));
nameValuePairs.add(new BasicNameValuePair("action","FY_BY_CLlCKBUTTON"));
nameValuePairs.add(new BasicNameValuePair("typoResult","true"));
return nameValuePairs;
}
public Header[] getHeader() {
Header[] headers=new Header[11];
headers[0]=new BasicHeader("Accept", "application/json, text/javascript, */*; q=0.01");
//headers[1]=new BasicHeader("Accept-Encoding", "gzip, deflate");
headers[1]=new BasicHeader("Accept-Encoding", "deflate");
headers[2]=new BasicHeader("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6");
headers[3]=new BasicHeader("Connection", "keep-alive");
headers[4]=new BasicHeader("X-Requested-With", "XMLHttpRequest");
headers[5]=new BasicHeader("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");
headers[6]=new BasicHeader("Cookie", "JSESSIONID=aaalHNVSigPD8-hsnhf3v; SESSION_FROM_COOKIE=fanyiweb; [email protected]; _ntes_nnid=1892114ba72ae7f868a29a4db02914a0,1502250589343; _dict_cpm_show=1502250589350; _dict_cpm_close=1; OUTFOX_SEARCH_USER_ID_NCOO=1688640113.572293; ___rl__test__cookies=1502251640921");
headers[7]=new BasicHeader("Host", "fanyi.youdao.com");
headers[8]=new BasicHeader("Origin", "http://fanyi.youdao.com");
headers[9]=new BasicHeader("Referer", "http://fanyi.youdao.com");
headers[10]=new BasicHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36");
//headers[11]=new BasicHeader("Content-Length", "205");
return headers;
}
//格式整理
public TransBean getForm(String info) {
TransBean transBean=new TransBean();
//string-->json化
JSONObject jsonObject=JSONObject.fromObject(info);
//主要爵士格式转化
JSONArray maininfo=JSONArray.fromObject(jsonObject.getString("translateResult"));
maininfo=((JSONArray) maininfo.get(0));
JSONObject mainTran =JSONObject.fromObject(maininfo.get(0).toString());
transBean.setWord(mainTran.getString("src"));
transBean.addTrans(mainTran.getString("tgt"));
//次要格式转化
JSONObject smarkinfo=JSONObject.fromObject(jsonObject.getString("smartResult"));
JSONArray smarkTran=JSONArray.fromObject(smarkinfo.get("entries"));
for(Object object:smarkTran) {
if(object!=""||object!=null) {
transBean.addTrans(object.toString().trim());
}
}
return transBean;
}
//用于流程操作 : 传入单词-->单词相关的注解+time
public TransBean TranProcess(String word) throws Exception {
//发送-接收服务器
HttpClient httpClient=new DefaultHttpClient();
HttpPost httpPost =new HttpPost();
HttpResponse httpResponse=null;
//设置发送message 执行发送
httpPost.setURI(this.targetUrl.toURI());
List nameValuePairs=this.getMessage("good");
httpPost.setEntity(new UrlEncodedFormEntity(nameValuePairs,"utf-8"));
httpPost.setHeaders(this.getHeader());
//执行发送
httpResponse=httpClient.execute(httpPost);
//获取实体返回结果
HttpEntity entity=httpResponse.getEntity();
String info =EntityUtils.toString(entity, "utf-8");
//将结果进行格式转换
TransBean transBean=this.getForm(info);
return transBean;
}
}
TranSpider目录结构
– public TranSpider() —-写好相关使用参数(这里仅用了URL)
– public List getMessage(String word) (填充相关的传输数据–发送的,翻译文字,加密格式(反爬虫,反盗用)等)
– public Header[] getHeader() (请求页面的头(常用反爬虫))
– public TransBean getForm(String info) (获取后的信息格式处理)
– public TransBean TranProcess(String word) (接口-把流程走一遍 –传回索要的数据—+格式)
Test相关
package com.lilutong.trans;
import java.net.MalformedURLException;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.HttpClient;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
public class Test {
public static void main(String args[]) throws Exception {
TranSpider tranSpider=new TranSpider();
TransBean transBean=tranSpider.TranProcess("word");
System.out.println(transBean);
}
}
声明对象+调用 就可以翻译了~~
附加jar包
忘了最重要的:
接口破译是看的知了课堂的一个爬虫视频,自己可以搜一下