软件工程第四周进度总结

本周学习了 Java 爬虫的相关知识。

一、GET请求

/**
 * Demonstrates an HTTP GET request with Apache HttpClient: builds a URI with
 * query parameters, executes the request and prints the response body.
 */
public class Web {
    static final Log logger = LogFactory.getLog(Web.class);

    /**
     * Sends a GET request to https://www.qidian.com and, when the status code
     * is 200, prints the length of the body followed by the body itself.
     *
     * @param args unused
     * @throws Exception if the request URI cannot be built
     */
    public static void main(String[] args) throws Exception {
        // Build the request URI; setParameter(name, value) may be chained to
        // add several query parameters (the second pair is an empty placeholder).
        URIBuilder uribuilder = new URIBuilder("https://www.qidian.com");
        uribuilder.setParameter("key", "xuanhuan").setParameter("", "");

        // "Type the address": create the GET request object.
        HttpGet httpGet = new HttpGet(uribuilder.build());
        System.out.println("发起请求的信息:" + httpGet);

        // "Open the browser and press enter": create the client and execute
        // the request. try-with-resources closes the response and then the
        // client even when execute() throws, so a failed request can no longer
        // hit a null response in a finally block or leave the client open.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpGet)) {
            // Only parse the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content.length());
                System.out.println(content);
            }
        } catch (Exception e) {
            logger.error("GET request failed", e);
        }
    }
}

二、POST请求

/**
 * Demonstrates an HTTP POST request with Apache HttpClient: wraps form
 * parameters in a URL-encoded entity, executes the request and prints the
 * length of the response body.
 */
public class Web {
    static final Log logger = LogFactory.getLog(Web.class);

    /**
     * Posts a form to https://www.baidu.com/index.php. Prints the body length
     * when the status code is 200, otherwise prints the failed response.
     *
     * @param args unused
     * @throws Exception if the form entity cannot be encoded
     */
    public static void main(String[] args) throws Exception {
        // "Type the address": create the POST request object.
        HttpPost httpPost = new HttpPost("https://www.baidu.com/index.php");
        System.out.println("发起请求的信息:" + httpPost);

        // Form parameters for the POST body (the single pair is an empty placeholder).
        List<BasicNameValuePair> params = new ArrayList<>();
        params.add(new BasicNameValuePair("", ""));

        // Form entity: first argument is the parameter list, second the encoding.
        UrlEncodedFormEntity formEntity = new UrlEncodedFormEntity(params, "utf8");
        httpPost.setEntity(formEntity);

        // "Open the browser and press enter": create the client and execute
        // the request. try-with-resources closes the response and then the
        // client even when execute() throws, so a failed request can no longer
        // hit a null response in a finally block or leave the client open.
        try (CloseableHttpClient httpClient = HttpClients.createDefault();
             CloseableHttpResponse response = httpClient.execute(httpPost)) {
            // Only parse the body when the status code is 200.
            if (response.getStatusLine().getStatusCode() == 200) {
                HttpEntity httpEntity = response.getEntity();
                String content = EntityUtils.toString(httpEntity, "utf8");
                System.out.println(content.length());
            } else {
                System.out.println("请求失败" + response);
            }
        } catch (Exception e) {
            logger.error("POST request failed", e);
        }
    }
}

三、Jsoup解析HTML获取DOM

/**
 * Demonstrates two ways of obtaining a jsoup {@code Document}: parsing
 * directly from a URL and parsing an HTML string fetched separately.
 */
public class Jsouputil {

    public static void main(String[] args) throws Exception {
        testUrl();
        testString();
    }

    /**
     * Parses a page directly from its URL and prints the page title.
     *
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void testUrl() throws Exception {
        // First argument is the URL, second the request timeout in milliseconds.
        Document doc = Jsoup.parse(new URL("https://www.qidian.com"), 1000);
        // Document.title() yields an empty string when the page has no <title>,
        // whereas getElementsByTag("title").first() would be null and make the
        // subsequent text() call throw a NullPointerException.
        System.out.println(doc.title());
    }

    /**
     * Fetches a page as an HTML string via {@code HttpClientPool}, parses the
     * string and prints the page title.
     *
     * @throws Exception if fetching or parsing fails
     */
    public static void testString() throws Exception {
        HttpClientPool httpClient = new HttpClientPool();
        // Connection pool manager handed to the fetch helper.
        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
        // HTML of the page as a string.
        String content = httpClient.doGet(cm);

        Document doc = Jsoup.parse(content);
        // Same null-safe title lookup as in testUrl().
        System.out.println(doc.title());
    }
}

四、DOM的方式获取元素

/**
 * Demonstrates the DOM-style lookup methods of a jsoup document: by id, by
 * tag, by class and by attribute. Prints what each lookup finds.
 *
 * @throws Exception if fetching the page fails
 */
public static void testDom()throws Exception{
    // Fetch the page HTML through the pooled client helper and parse it.
    HttpClientPool httpClient = new HttpClientPool();
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    String content = httpClient.doGet(cm);
    Document doc = Jsoup.parse(content);

    // 1. Lookup by id. getElementById returns null when no element has the
    //    id, so guard before dereferencing instead of failing with an NPE.
    Element elementById = doc.getElementById("overseas_tit");
    if (elementById != null) {
        System.out.println(elementById.text());
    } else {
        System.out.println("未找到id为overseas_tit的元素");
    }

    // 2. Lookup by tag name.
    Elements elementsByTag = doc.getElementsByTag("span");
    System.out.println(elementsByTag.text());

    // 3. Lookup by class name.
    Elements elementsByClass = doc.getElementsByClass("chart_table_th");
    System.out.println(elementsByClass.text());

    // 4. Lookup by attribute name, and by attribute name + value.
    Elements elementsByAttribute = doc.getElementsByAttribute("src");
    Elements elementsByAttributeValue = doc.getElementsByAttributeValue("class", "chart_table_name");
    System.out.println(elementsByAttribute);
    System.out.println(elementsByAttributeValue.text());
}

五、获取元素中的数据

/**
 * Demonstrates reading data out of a single element: prints the element,
 * then all of its attributes, then its text content.
 *
 * Other accessors available on an element (not exercised here):
 * {@code id()}, {@code className()}, {@code classNames()} (a
 * {@code Set<String>}) and {@code attr("name")} for a single attribute value.
 *
 * @throws Exception if fetching the page fails
 */
public static void testData()throws Exception{
    // Fetch the page HTML through the pooled client helper and parse it.
    HttpClientPool httpClient = new HttpClientPool();
    PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
    String content = httpClient.doGet(cm);
    Document doc = Jsoup.parse(content);

    Element element = doc.getElementById("overseas_tit");
    System.out.println(element);
    // getElementById returns null when no element has the id; stop here
    // rather than throwing a NullPointerException on the calls below.
    if (element == null) {
        System.out.println("未找到id为overseas_tit的元素");
        return;
    }

    // All attributes of the element.
    Attributes attributes = element.attributes();
    System.out.println(attributes.toString());

    // Text content of the element.
    String str = element.text();
    System.out.println(str);
}

六、Selector选择器获取元素

    /**
     * Looks up elements with CSS-like selector syntax via {@code select} and
     * prints the combined text of everything matching
     * {@code [class=chart_table_name]}.
     *
     * Selector forms, for reference:
     * <ul>
     *   <li>{@code span} - by tag name</li>
     *   <li>{@code #overseas_tit} - by id</li>
     *   <li>{@code .chart_table_name} - by class</li>
     *   <li>{@code [class]} - by attribute name</li>
     *   <li>{@code [class=chart_table_name]} - by attribute name and value</li>
     *   <li>{@code el#id}, {@code el.class}, {@code el[attr]} - tag combined
     *       with id / class / attribute (e.g. {@code h3#city_bj}); these may be
     *       combined freely</li>
     *   <li>{@code ancestor child} - descendants of an element</li>
     *   <li>{@code parent > child} - direct children of a parent</li>
     *   <li>{@code parent > *} - all direct children of a parent</li>
     * </ul>
     *
     * @throws Exception if fetching the page fails
     */
    public static void testSelector()throws Exception{
        // Fetch the page HTML through the pooled client helper.
        String html = new HttpClientPool().doGet(new PoolingHttpClientConnectionManager());

        // Parse the HTML string and run the [attr=value] selector.
        Elements matches = Jsoup.parse(html).select("[class=chart_table_name]");

        System.out.println(matches.text());
    }

 

你可能感兴趣的:(软件工程第四周进度总结)