WebMagic的学习基础:Jsoup的学习(Http基础API和Jsoup基础API)
在学习WebMagic之前,我们需要简单了解关于Jsoup的知识,WebMagic是基于Jsoup的爬虫工具。
下面我会列出Jsoup的API用法。在此之前,先列出使用HttpClient发送Http请求的一些基本操作。
所需要的依赖:
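<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.10.2</version>
</dependency>
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>2.4</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.7</version>
</dependency>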
HTTPGet的使用:
public class HttpGetTest { public static void main(String[] arge){ //创建httpClient对象 CloseableHttpClient httpClient= HttpClients.createDefault(); //创建HTTPGet的对象,设置url访问地址 HttpGet httpGet=new HttpGet("http://www.itcast.cn"); CloseableHttpResponse response =null; //使用Http发送请求,获取response try { response =httpClient.execute(httpGet); //解析响应 if(response.getStatusLine().getStatusCode()==200){ String content=EntityUtils.toString(response.getEntity(),"utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { //关闭连接请求 try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
带参数的Get请求:
public class HttpGetParmTest { public static void main(String[] arge) { //创建httpClient对象 CloseableHttpClient httpClient = HttpClients.createDefault(); //创建URLBuilder URIBuilder uriBuilder = null; try { uriBuilder = new URIBuilder("http://yun.itheima.com/search"); } catch (URISyntaxException e) { e.printStackTrace(); } uriBuilder.setParameter("keys", "Java"); //创建HTTPGet的对象,设置url访问地址 HttpGet httpGet = null; try { httpGet = new HttpGet(uriBuilder.build()); } catch (URISyntaxException e) { e.printStackTrace(); } CloseableHttpResponse response = null; //使用Http发送请求,获取response System.out.println("发送的请求地址:" + httpGet); try { response = httpClient.execute(httpGet); //解析响应 if (response.getStatusLine().getStatusCode() == 200) { String content = EntityUtils.toString(response.getEntity(), "utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); } finally { try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
Post的请求:
与Get请求类似,只需要将HttpGet类换成HttpPost类即可。
Post中带参数请求:(添加的参数为表单信息)
public class HttpPostParmTest { public static void main(String[] arge) throws UnsupportedEncodingException { //创建httpClient对象 CloseableHttpClient httpClient= HttpClients.createDefault(); //创建HTTPPost的对象,设置url访问地址 HttpPost httpPost=new HttpPost("http://itcast.cn"); //声明list集合 封装表单中的参数 Listpairs=new ArrayList (); pairs.add(new BasicNameValuePair("keys","Java")); //创建表单中Entit对象 UrlEncodedFormEntity formEntity=new UrlEncodedFormEntity(pairs,"utf8"); //设置表单对象到Post请求中 httpPost.setEntity(formEntity); System.out.println("发送的请求为:"+httpPost); CloseableHttpResponse response =null; //使用Http发送请求,获取response try { response =httpClient.execute(httpPost); //解析响应 if(response.getStatusLine().getStatusCode()==200){ String content=EntityUtils.toString(response.getEntity(),"utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { try { response.close(); } catch (IOException e) { e.printStackTrace(); } try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } } }
连接池的使用:
public class HttpClientPool { public static void main(String[] args) { //创建连接池管理器 PoolingHttpClientConnectionManager clientConnectionManager = new PoolingHttpClientConnectionManager(); //设置连接数 clientConnectionManager.setMaxTotal(100); //设置每个主机的最大连接数 clientConnectionManager.setDefaultMaxPerRoute(10); //使用管理器发起请求 doGet(clientConnectionManager); doGet(clientConnectionManager); } private static void doGet(PoolingHttpClientConnectionManager clientConnectionManager) { //从连接池中获取对象 CloseableHttpClient httpClient = HttpClients.custom().setConnectionManager(clientConnectionManager).build(); HttpGet httpGet = new HttpGet("http://www.itcast.cn"); CloseableHttpResponse response = null; try { response = httpClient.execute(httpGet); if (response.getStatusLine().getStatusCode() == 200) { String content= EntityUtils.toString(response.getEntity(),"utf8"); System.out.println(content.length()); } } catch (IOException e) { e.printStackTrace(); }finally { if(response!=null){ try { response.close(); } catch (IOException e) { e.printStackTrace(); } //httpClient的关闭由连接池管理 } } } }
Get请求携带配置信息:
//配置请求信息 RequestConfig config=RequestConfig.custom().setConnectTimeout(1000)//创建连接的最长时间 .setConnectionRequestTimeout(500)//设置获取连接的最长时间 .setSocketTimeout(10*1000)//设置数据传输的最长时间 .build(); //给设置的请求信息加进去 httpGet.setConfig(config);
Jsoup基础API使用:(注释很详细,就不做解释了)
public class JsoupFirsttest { @Test public void testUrl() throws Exception { //解析Url地址,第一个参数是访问额url,第二个参数是访问时候的超时时间 Document doc = Jsoup.parse(new URL("http://www.itcast.cn"), 1000); //使用标签选择器 String title = doc.getElementsByTag("title").first().text(); //打印 System.out.println(title); } @Test public void testString() throws Exception { //读取文件,获取字符串 String content = FileUtils.readFileToString(new File("C:\\Users\\SuperMan\\Desktop\\test.html"), "utf8"); //解析字符串 Document doc = Jsoup.parse(content); String title = doc.getElementsByTag("title").first().text(); System.out.println(title); } @Test public void testFile() throws Exception { //解析文件 Document doc = Jsoup.parse(new File("C:\\Users\\SuperMan\\Desktop\\test.html"), "utf8"); String title = doc.getElementsByTag("title").first().text(); System.out.println(title); } @Test public void TestDom() throws Exception { Document doc = Jsoup.parse(new File("C:\\Users\\SuperMan\\Desktop\\test.html"), "utf8"); //获取元素 //通过ID获取元素内容 Element element = doc.getElementById("city_bj"); //通过标签获取元素 Element element1 = doc.getElementsByTag("span").get(12); //通过class获取元素 Element element2 = doc.getElementsByClass("fdnav").first(); //通过属性获取 Element element3 = doc.getElementsByAttribute("abc").first(); Element element4 = doc.getElementsByAttributeValue("href", "http://yun.itheima.com/").first(); //打印 System.out.println(element.text()); System.out.println(element1.text()); System.out.println(element2.text()); System.out.println(element3.text()); System.out.println(element4.text()); } @Test public void testData()throws Exception{ Document doc = Jsoup.parse(new File("C:\\Users\\SuperMan\\Desktop\\test.html"), "utf8"); //获取元素 Element element=doc.getElementById("cy"); String str=""; //从元素中获取ID str=element.id(); System.out.println(str); //获取ClassName Sets =element.classNames(); for (String string:s ) { System.out.println(string); } //获取attr str=element.attr("id"); //获取所有属性 Attributes attribute=element.attributes(); 
System.out.println(attribute.toString()); //获取文本内容 str=element.text(); System.out.println(str); } }