这天遇到这样一个需求:这种页面数据可以抓取吗?
随后提供了账号、密码和网站地址:
账号:kytj1
密码:******************
登录地址:http://student.tiaoji.kaoyan.com/tjadm
1、使用Fiddler4分析http请求交互方式,包括数据发送方式(POST或GET),携带参数等,获得返回的数据信息
2、用Android程序模拟HTTP请求
3、用Java解析HTML代码,提取出对应的姓名、报考学校、报考专业、分数、联系电话、发布时间等字段
4、把提取出的数据写入txt文件,再把txt文件导入到Excel里,待进一步处理。
登录地址:http://student.tiaoji.kaoyan.com/tjadm
可以看到HOST、URL、POST方式以及明文密码
登录成功后,网页数据显示为
可以看到请求的HOST以及URL,方式为GET,返回的数据也可以在body体中获取到。
返回的HTML页面代码为(选取了部分)
考研调剂中心_考研调剂意向发布系统_考研调剂_考研网(kaoyan.com)
package com.example.testget;
import org.apache.http.HttpEntity;
import android.content.Context;
import com.loopj.android.http.AsyncHttpClient;
import com.loopj.android.http.AsyncHttpResponseHandler;
import com.loopj.android.http.RequestParams;
/**
 * Static helper that sends GET and POST requests through one shared
 * {@link AsyncHttpClient}. Every URL passed in is treated as relative and is
 * prefixed with {@link #BASE_URL} before the request is issued.
 */
public class XcAsyncHttpClientUtil {
    /** Scheme and host prepended to every relative URL. */
    public static final String BASE_URL = "http://ntiaoji.kaoyan.com";
    /** Relative path used for the login POST. */
    public static final String LOGIN_URL = "/tjadm/login";
    /** Relative path of page 1 of the listing; not referenced inside this class. */
    public static final String INDEX1 = "/tjadm/1.html";

    // Single client instance reused by every request; final so it cannot be
    // reassigned after class initialisation.
    private static final AsyncHttpClient client = new AsyncHttpClient();

    /**
     * Issues a GET request.
     *
     * @param url             path relative to {@link #BASE_URL}
     * @param params          query parameters passed through to the client
     * @param responseHandler callback passed through to the client
     */
    public static void get(String url, RequestParams params,
            AsyncHttpResponseHandler responseHandler) {
        client.get(getAbsoluteUrl(url), params, responseHandler);
    }

    /**
     * Issues a POST request carrying {@code params}.
     *
     * @param url             path relative to {@link #BASE_URL}
     * @param params          request parameters passed through to the client
     * @param responseHandler callback passed through to the client
     */
    public static void post(String url, RequestParams params,
            AsyncHttpResponseHandler responseHandler) {
        client.post(getAbsoluteUrl(url), params, responseHandler);
    }

    /**
     * Issues a POST request whose body is the given entity.
     *
     * @param context         Android context passed through to the client
     * @param url             path relative to {@link #BASE_URL}
     * @param entity          request body
     * @param responseHandler callback passed through to the client
     */
    public static void post(Context context, String url, HttpEntity entity,
            AsyncHttpResponseHandler responseHandler) {
        // NOTE(review): the content type is passed as an empty string; confirm
        // against the client library whether null would be the better choice.
        client.post(context, getAbsoluteUrl(url), entity, "", responseHandler);
    }

    /**
     * Builds the full URL for a relative path.
     *
     * @param relativeUrl path to append to {@link #BASE_URL}
     * @return {@code BASE_URL + relativeUrl}
     */
    public static String getAbsoluteUrl(String relativeUrl) {
        return BASE_URL + relativeUrl;
    }
}
效果图如下:
/**
 * Inflates the main layout and wires the two buttons: {@code btn} starts the
 * login request, {@code btn1} starts fetching listing pages.
 */
@Override
protected void onCreate(Bundle savedInstanceState) {
    super.onCreate(savedInstanceState);
    setContentView(R.layout.activity_main);

    // Login button.
    btn = (Button) findViewById(R.id.btn);
    OnClickListener loginClick = new OnClickListener() {
        @Override
        public void onClick(View v) {
            dologin();
        }
    };
    btn.setOnClickListener(loginClick);

    // Fetch-data button.
    btn1 = (Button) findViewById(R.id.btn1);
    OnClickListener fetchClick = new OnClickListener() {
        @Override
        public void onClick(View v) {
            doGetData();
        }
    };
    btn1.setOnClickListener(fetchClick);
}
/**
 * Posts the hard-coded username and password to
 * {@code XcAsyncHttpClientUtil.LOGIN_URL} and logs the response body.
 */
private void dologin() {
    RequestParams params = new RequestParams();
    params.put("username", "kytj1");
    params.put("password", "***********");
    XcAsyncHttpClientUtil.post(XcAsyncHttpClientUtil.LOGIN_URL, params,
            new AsyncHttpResponseHandler() {
                @Override
                public void onSuccess(int statusCode, Header[] headers,
                        byte[] responseBody) {
                    // Guard: decoding a null body would throw a NullPointerException.
                    if (responseBody == null) {
                        Log.e("TAG", "empty login response, statusCode=" + statusCode);
                        return;
                    }
                    try {
                        String body = new String(responseBody, "UTF-8");
                        Log.e("TAG", body);
                    } catch (UnsupportedEncodingException e) {
                        Log.e("TAG", "cannot decode login response", e);
                    }
                }

                @Override
                public void onFailure(int statusCode, Header[] headers,
                        byte[] responseBody, Throwable error) {
                    // Keep the original message but attach the cause so the
                    // failure can be diagnosed from the log.
                    Log.e("Login", "onFailure", error);
                }
            });
}
/**
 * Requests listing page {@code page} ({@code /tjadm/<page>.html}) and hands
 * the decoded HTML to {@code parse}. Failures are logged instead of being
 * silently dropped, because a dropped failure ends the page-by-page crawl
 * without any trace.
 */
protected void doGetData() {
    RequestParams params = new RequestParams();
    XcAsyncHttpClientUtil.get("/tjadm/" + page + ".html", params,
            new AsyncHttpResponseHandler() {
                @Override
                public void onSuccess(int statusCode, Header[] headers,
                        byte[] responseBody) {
                    // Guard: decoding a null body would throw a NullPointerException.
                    if (responseBody == null) {
                        Log.e("tag", "empty response for page " + page
                                + ", statusCode=" + statusCode);
                        return;
                    }
                    try {
                        String html = new String(responseBody, "UTF-8");
                        parse(html);
                    } catch (UnsupportedEncodingException e) {
                        Log.e("tag", "cannot decode page " + page, e);
                    }
                }

                @Override
                public void onFailure(int statusCode, Header[] headers,
                        byte[] responseBody, Throwable error) {
                    Log.e("tag", "request for page " + page
                            + " failed, statusCode=" + statusCode, error);
                }
            });
}
/**
 * Extracts the rows of the first {@code table.tiaoji-tab} in the given HTML,
 * writes each row as one '#'-separated line via {@code initData}, then either
 * requests the next page or resets {@code page} to 1 once {@code totalsize}
 * pages have been processed.
 *
 * @param html HTML source of one listing page
 */
protected void parse(String html) {
    Document doc = Jsoup.parse(html);
    Element tiaojiTab = doc.select("table.tiaoji-tab").first();
    if (tiaojiTab == null) {
        // first() yields null when nothing matches; stop here rather than
        // crash. page is left unchanged so the same page can be retried.
        Log.e("tag", "table.tiaoji-tab not found on page " + page);
        return;
    }

    Elements rows = tiaojiTab.getElementsByTag("tr");
    // Row 0 is skipped (presumably the table header; confirm against the markup).
    for (int i = 1; i < rows.size(); i++) {
        Elements cells = rows.get(i).getElementsByTag("td");
        StringBuilder line = new StringBuilder();
        for (Element cell : cells) {
            line.append(cell.text()).append('#');
        }
        String all = line.toString();
        initData(all);
        Log.e("tag", all);
    }

    page++;
    if (page < totalsize + 1) {
        doGetData();
    } else {
        page = 1;
    }
}
// Take the first element matching the CSS selector "table.tiaoji-tab".
doc.select("table.tiaoji-tab").first();
从整个HTML文档里取出要解析的内容信息,根据“tr”取得元素组,从第2条开始取数据,调用for循环。
// Advance to the next page number.
page++;
// Keep fetching while page <= totalsize; otherwise reset the counter to 1.
if (page < totalsize + 1) {
doGetData();
} else {
page = 1;
}
继续取下一页,代码中设置的totalsize=200,即每运行一次程序,抓取200页数据。
/**
 * Writes one record to {@code /sdcard/Test/tiaoji.txt}, making sure the
 * directory and file exist first.
 *
 * @param msg the line to write (a line break is added by the writer)
 */
private void initData(String msg) {
    final String dir = "/sdcard/Test/";
    final String name = "tiaoji.txt";
    // Create directory and file if needed, then write the record.
    makeFilePath(dir, name);
    writeTxtToFile(msg, dir, name);
}
/**
 * Appends a string, followed by CRLF, to the end of a text file. The file and
 * its parent directories are created when missing. Errors are logged, not thrown.
 *
 * @param strcontent text to append
 * @param filePath   directory part of the path (concatenated directly with fileName)
 * @param fileName   file name part of the path
 */
public void writeTxtToFile(String strcontent, String filePath,
        String fileName) {
    // The directory must exist before the file is created, otherwise creation fails.
    String strFilePath = filePath + fileName;
    // Each record goes on its own line.
    String strContent = strcontent + "\r\n";
    RandomAccessFile raf = null;
    try {
        File file = new File(strFilePath);
        if (!file.exists()) {
            Log.d("TestFile", "Create the file:" + strFilePath);
            File parent = file.getParentFile();
            if (parent != null) {
                parent.mkdirs();
            }
            file.createNewFile();
        }
        raf = new RandomAccessFile(file, "rwd");
        // Position at end of file so the write appends.
        raf.seek(file.length());
        // Explicit charset so the output does not depend on the platform default.
        raf.write(strContent.getBytes("UTF-8"));
    } catch (Exception e) {
        Log.e("TestFile", "Error on write File:" + e);
    } finally {
        // Close even when seek/write failed, so the file handle is not leaked.
        if (raf != null) {
            try {
                raf.close();
            } catch (Exception e) {
                Log.e("TestFile", "Error on close File:" + e);
            }
        }
    }
}
/**
 * Ensures the directory exists, then creates the file if it is missing.
 *
 * @param filePath directory part of the path (concatenated directly with fileName)
 * @param fileName file name part of the path
 * @return the {@link File} for {@code filePath + fileName}; exceptions raised
 *         while creating it are printed and not rethrown
 */
public File makeFilePath(String filePath, String fileName) {
    makeRootDirectory(filePath);
    File target = null;
    try {
        target = new File(filePath + fileName);
        boolean missing = !target.exists();
        if (missing) {
            target.createNewFile();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return target;
}
/**
 * Creates the directory at {@code filePath} if it does not exist, including
 * any missing parent directories. Failures are logged, not thrown.
 *
 * @param filePath path of the directory to create
 */
public static void makeRootDirectory(String filePath) {
    try {
        File dir = new File(filePath);
        // mkdirs() also creates missing parents, where mkdir() would fail;
        // its result is checked so a failed creation shows up in the log.
        if (!dir.exists() && !dir.mkdirs()) {
            Log.i("error:", "could not create directory " + filePath);
        }
    } catch (Exception e) {
        Log.i("error:", e + "");
    }
}
package com.example.testget;
import java.io.File;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import org.apache.http.Header;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import android.app.Activity;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.view.View.OnClickListener;
import android.widget.Button;
import com.loopj.android.http.AsyncHttpResponseHandler;
import com.loopj.android.http.RequestParams;
public class MainActivity extends Activity {
private Button btn, btn1;
private int page = 1;
private static final int totalsize = 200;
@Override
protected void onCreate(Bundle savedInstanceState) {
super.onCreate(savedInstanceState);
setContentView(R.layout.activity_main);
btn = (Button) findViewById(R.id.btn);
btn.setOnClickListener(new OnClickListener() {
@Override
public void onClick(View v) {
dologin();
}
});
btn1 = (Button) findViewById(R.id.btn1);
btn1.setOnClickListener(new OnClickListener() {
@Override
public void onClick(View v) {
doGetData();
}
});
}
private void dologin() {
RequestParams params = new RequestParams();
params.put("username", "kytj1");
params.put("password", "************");
XcAsyncHttpClientUtil.post(XcAsyncHttpClientUtil.LOGIN_URL, params,
new AsyncHttpResponseHandler() {
@Override
public void onSuccess(int statusCode, Header[] headers,
byte[] responseBody) {
try {
String jsonString = new String(responseBody,
"UTF-8");
Log.e("TAG", jsonString);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
@Override
public void onFailure(int statusCode, Header[] headers,
byte[] responseBody, Throwable error) {
Log.e("Login", "onFailure");
}
});
}
protected void doGetData() {
RequestParams params = new RequestParams();
XcAsyncHttpClientUtil.get("/tjadm/" + page + ".html", params,
new AsyncHttpResponseHandler() {
@Override
public void onSuccess(int statusCode, Header[] headers,
byte[] responseBody) {
try {
String jsonString = new String(responseBody,
"UTF-8");
parse(jsonString);
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
}
@Override
public void onFailure(int statusCode, Header[] headers,
byte[] responseBody, Throwable error) {
}
});
}
protected void parse(String html) {
Document doc = Jsoup.parse(html);
Element tiaojiTab = doc.select("table.tiaoji-tab").first();
Elements lists = tiaojiTab.getElementsByTag("tr");
int size = lists.size();
for (int i = 1; i < size; i++) {
Element item = lists.get(i);
Elements els = item.getElementsByTag("td");
String all = "";
for (int j = 0; j < els.size(); j++) {
Element value = els.get(j);
String text = value.text();
all = all + text + "#";
}
initData(all);
Log.e("tag", all);
}
page++;
if (page < totalsize + 1) {
doGetData();
} else {
page = 1;
}
}
private void initData(String msg) {
String filePath = "/sdcard/Test/";
String fileName = "tiaoji.txt";
makeFilePath(filePath, fileName);
writeTxtToFile(msg, filePath, fileName);
}
// 将字符串写入到文本文件中
public void writeTxtToFile(String strcontent, String filePath,
String fileName) {
// 生成文件夹之后,再生成文件,不然会出错
String strFilePath = filePath + fileName;
// 每次写入时,都换行写
String strContent = strcontent + "\r\n";
try {
File file = new File(strFilePath);
if (!file.exists()) {
Log.d("TestFile", "Create the file:" + strFilePath);
file.getParentFile().mkdirs();
file.createNewFile();
}
RandomAccessFile raf = new RandomAccessFile(file, "rwd");
raf.seek(file.length());
raf.write(strContent.getBytes());
raf.close();
} catch (Exception e) {
Log.e("TestFile", "Error on write File:" + e);
}
}
// 生成文件
public File makeFilePath(String filePath, String fileName) {
File file = null;
makeRootDirectory(filePath);
try {
file = new File(filePath + fileName);
if (!file.exists()) {
file.createNewFile();
}
} catch (Exception e) {
e.printStackTrace();
}
return file;
}
// 生成文件夹
public static void makeRootDirectory(String filePath) {
File file = null;
try {
file = new File(filePath);
if (!file.exists()) {
file.mkdir();
}
} catch (Exception e) {
Log.i("error:", e + "");
}
}
}