笔记:Android用jsoup抓取网页HTML解析数据

(jsoup入门)做个笔记,方便以后忘了可以翻看。这里只是为了测试而写的入门示例,只考虑功能实现,其他问题暂不考虑。另外,这里只用了select()来抓取(因为用得顺手,当然也可以用getElementXxx()系列方法)。由于时间有限,这个网页只抓取了一部分,以后有时间再补全。

直接复制粘贴到工程中就可以看效果了

哦对了,最近有看过Charles一些文章,Charles是一个Mac和Windows平台都可以使用的抓包工具,有空仔细研究研究

jsoup官方文档
https://jsoup.org/cookbook/
中文文档
http://www.open-open.com/jsoup/
下载jar包地址
http://jsoup.org/download
抓取地址
这里抓取的是泡在网上的日子
http://www.jcodecraeer.com

准备工作

1、Android studio的app下的build.gradle中
添加这行依赖:compile 'org.jsoup:jsoup:1.11.3'(较新版本的Android Gradle插件请改用 implementation 'org.jsoup:jsoup:1.11.3'),然后点击Sync Now同步工程
现在官网的最新版本就是1.11.3

查看网页
右键(泡在网上的日子)--检查

logo 导航栏1

笔记:Android用jsoup抓取网页HTML解析数据_第1张图片
1524884602(1).jpg

导航栏1_1


笔记:Android用jsoup抓取网页HTML解析数据_第2张图片
1524884691(1).jpg

排行榜


笔记:Android用jsoup抓取网页HTML解析数据_第3张图片
1524884778(1).jpg

内容1


笔记:Android用jsoup抓取网页HTML解析数据_第4张图片
1524884829(1).jpg

banner


笔记:Android用jsoup抓取网页HTML解析数据_第5张图片
1524884931(1).jpg

MainActivity.java

public class MainActivity extends AppCompatActivity {

    private JsoupBean jsoupBean;
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        //抓取后的数据放到这个bean中
        jsoupBean = new JsoupBean();

        //这里需要放在子线程中完成,否则报这个错android.os.NetworkOnMainThreadException
        new Thread(new Runnable() {
            @Override
            public void run() {
                jsoupData();
            }
        }).start();

        findViewById(R.id.test).setOnClickListener(new View.OnClickListener() {
            @Override
            public void onClick(View view) {
                //抓完后打印一下logo,看看有没有把数据存到bean中
                Log.e("wwww",jsoupBean.toString();
                
            }
        });
    }

    private void jsoupData() {
        //抓取的目标网址
        String url = "http://www.jcodecraeer.com";

        try {//捕捉异常

            Document document = Jsoup.connect(url).get();//这里可用get也可以post方式,具体区别请自行了解

            //=======================logo    这个是泡在网上的日子的logo================
            //jsoupBean.setLogoImg();将数据放到bean的集合中list,其他也雷同,下面不做解释了
            //document.select("a.logo-t>img").attr("src")查找a标签class="logo-t"下的子标签img的属性src的值
            //document.select("a.logo-t").text()查找a标签class="logo-t"包含的内容

            jsoupBean.setLogoImg(url + document.select("a.logo-t>img").attr("src"));//select的api的详细用法请查看官方文档,这里也做简单说明
            jsoupBean.setLogoUrl(url);
            jsoupBean.setLogoName(document.select("a.logo-t").text());


            //======================导航栏1================

            Elements nv1_elements_list = document.select("ul.nav-ul>li");//查找class="nav-ul"的ul下的所有li,这里得到的是一个Elements数据
            List Nv1_NameList = new ArrayList<>();
            List Nv1_UrlList = new ArrayList<>();

            for (Element element : nv1_elements_list) {//遍历数组
                //                Log.e("wwww",element.select("a").attr("href"));
                //                Log.e("wwww",element.select("a").text());

                Nv1_NameList.add(element.select("a").text());//查找element下的a标签的内容
                if (element.select("a").attr("href").equals("/")) {//查找element下的a标签的href属性值
                    Nv1_UrlList.add(url);//由于 泡在网上的日子返回的是“/”,这里判断了一下再添加到Nv1_UrlList中
                } else {
                    Nv1_UrlList.add(url + element.select("a").attr("href"));
                }
            }
            jsoupBean.setNv1_NameList(Nv1_NameList);
            jsoupBean.setNv1_UrlList(Nv1_UrlList);
            //============导航栏1-1===================
            //            Log.e("wwww",document.select("a.lg_app").attr("href"));
            //            Log.e("wwww",document.select("a.lg_app").text());

            List Nv1_1_NameList = new ArrayList<>();
            List Nv1_1_UrlList = new ArrayList<>();

            Nv1_1_NameList.add(0, url + document.select("a.lg_app").attr("href"));//a.lg_app  :标签.class的值      attr("href")    :属性href的值
            Nv1_1_UrlList.add(0, document.select("a.lg_app").text());//text()    :标签内的值

            //            Log.e("wwww",document.select("div.search_cont>form").attr("action"));
            //            Log.e("wwww",document.select("input.in_search").attr("value"));

            Nv1_1_NameList.add(1, document.select("div.search_cont>form").attr("action"));//>form  :直接子标签form
            Nv1_1_UrlList.add(1, document.select("input.in_search").attr("value"));

            //first()   get(1)    :由于查出来的是一个Elements(数组),所以这两个表示数组的0 1下标对应的值
            //            Log.e("wwww",document.select("div#login_info>a").get(1).toString());

            Elements nv_1_1_elements = document.select("div#login_info>a");//标签#id
            for (Element element : nv_1_1_elements) {//循环遍历数组nv_1_1_elements
                //                Log.e("wwww",element.attr("href"));
                //                Log.e("wwww",element.text());

                Nv1_1_NameList.add(url + element.attr("href"));
                Nv1_1_UrlList.add(element.text());
            }
            jsoupBean.setNv1_1_NameList(Nv1_1_NameList);
            jsoupBean.setNv1_1_UrlList(Nv1_1_UrlList);

            //============导航栏2===========================
            Elements nv_2_elements = document.select("ul#nav>li");
            List Nv2_NameList = new ArrayList<>();
            List Nv2_UrlList = new ArrayList<>();

            for (Element element : nv_2_elements) {//循环遍历数组nv_2_elements
                //                Log.e("wwww",element.select("a").attr("href"));
                //                Log.e("wwww",element.select("a").text());

                Nv2_NameList.add(element.select("a").text());
                Nv2_UrlList.add(url + element.select("a").attr("href"));
            }
            jsoupBean.setNv2_NameList(Nv2_NameList);
            jsoupBean.setNv2_UrlList(Nv2_UrlList);
            //=================advertigical广告     抓取不到数据,先不理,明天有空问一下公司的网页前段大神为什么,我猜应该是js注入========================
            Elements advert_elements = document.select("div.col-md-6");
            for (Element element : advert_elements) {//循环遍历数组advert_elements

                //                Log.e("wwww",element.select("a").attr("href"));
                //                Log.e("wwww",element.select("a").toString());
                //                Log.e("wwww",element.select("a>img[src$=.png]").toString());

            }
            //===============banner============================

            Elements banner_elements = document.select("div.item");

            List banner_ContentList = new ArrayList<>();
            List banner_UrlList = new ArrayList<>();
            List banner_ImgList = new ArrayList<>();

            for (Element element : banner_elements) {//循环遍历数组banner_elements
                //                Log.e("wwww", element.select("a").attr("href"));
                //                Log.e("wwww", element.select("h3").text());
                //                Log.e("wwww", element.select("a>img").attr("src"));

                banner_ContentList.add(element.select("h3").text());
                banner_UrlList.add(element.select("a").attr("href"));
                banner_ImgList.add(url + element.select("a>img").attr("src"));
            }
            jsoupBean.setBanner_ContentList(banner_ContentList);
            jsoupBean.setBanner_UrlList(banner_UrlList);
            jsoupBean.setBanner_ImgList(banner_ImgList);
            //====================content_1========================
            Elements content1_elements = document.select("ul.arclist>li");
            List content1_UrlList = new ArrayList<>();
            List content1_ContentList = new ArrayList<>();

            for (Element element : content1_elements) {//循环遍历数组
                //                Log.e("wwww", element.select("a").attr("href"));
                //                Log.e("wwww", element.select("a").text());

                content1_UrlList.add(element.select("a").attr("href"));
                content1_ContentList.add(element.select("a").text());
            }
            jsoupBean.setContent1_ContentList(content1_ContentList);
            jsoupBean.setContent1_UrlList(content1_UrlList);
            //================Rank  排行榜=======================
            Elements rank_elements = document.select("ul.nav>li");
            List rank_UrlList = new ArrayList<>();
            List rank_ContentList = new ArrayList<>();

            for (Element element : rank_elements) {//循环遍历数组
                Log.e("wwww", element.select("a").attr("href"));
                Log.e("wwww", element.select("a").text());

                rank_UrlList.add(element.select("a").attr("href"));
                rank_ContentList.add(element.select("a").text());
            }
            jsoupBean.setRank_ContentList(rank_ContentList);
            jsoupBean.setRank_UrlList(rank_UrlList);

        } catch (Exception e) {
            Log.e("wwwwwwwww==", e.toString());
        }
    }
}

JsoupBean.java 数据bean

/**
 * Plain mutable holder for the values scraped from the page: logo data plus parallel
 * name/URL (or content/URL/image) string lists for each page section.
 *
 * <p>Every field starts as {@code null} and stays so until its setter is called; getters
 * return the stored reference as-is (no defensive copies). The class is not thread-safe.
 */
public class JsoupBean {
    // logo
    private String logoUrl;
    private String logoName;
    private String logoImg;
    // top-left navigation bar
    private List<String> nv1_NameList;
    private List<String> nv1_UrlList;
    // top-right navigation bar
    private List<String> nv1_1_NameList;
    private List<String> nv1_1_UrlList;
    // navigation bar 2
    private List<String> nv2_NameList;
    private List<String> nv2_UrlList;
    // advert bar
    private List<String> advert_Img_List;
    private List<String> advert_Url_List;
    // banner
    private List<String> banner_ContentList;
    private List<String> banner_UrlList;
    private List<String> banner_ImgList;
    // content 1 (next to the banner)
    private List<String> content1_UrlList;
    private List<String> content1_ContentList;
    // ranking list titles
    private List<String> rank_UrlList;
    private List<String> rank_ContentList;

    /** @return the ranking entry links, or {@code null} if never set */
    public List<String> getRank_UrlList() {
        return rank_UrlList;
    }

    /** @param rank_UrlList the ranking entry links */
    public void setRank_UrlList(List<String> rank_UrlList) {
        this.rank_UrlList = rank_UrlList;
    }

    /** @return the ranking entry texts, or {@code null} if never set */
    public List<String> getRank_ContentList() {
        return rank_ContentList;
    }

    /** @param rank_ContentList the ranking entry texts */
    public void setRank_ContentList(List<String> rank_ContentList) {
        this.rank_ContentList = rank_ContentList;
    }

    /** @return the content-1 entry links, or {@code null} if never set */
    public List<String> getContent1_UrlList() {
        return content1_UrlList;
    }

    /** @param content1_UrlList the content-1 entry links */
    public void setContent1_UrlList(List<String> content1_UrlList) {
        this.content1_UrlList = content1_UrlList;
    }

    /** @return the content-1 entry texts, or {@code null} if never set */
    public List<String> getContent1_ContentList() {
        return content1_ContentList;
    }

    /** @param content1_ContentList the content-1 entry texts */
    public void setContent1_ContentList(List<String> content1_ContentList) {
        this.content1_ContentList = content1_ContentList;
    }

    /** @return the advert image list, or {@code null} if never set */
    public List<String> getAdvert_Img_List() {
        return advert_Img_List;
    }

    /** @param advert_Img_List the advert image list */
    public void setAdvert_Img_List(List<String> advert_Img_List) {
        this.advert_Img_List = advert_Img_List;
    }

    /** @return the banner captions, or {@code null} if never set */
    public List<String> getBanner_ContentList() {
        return banner_ContentList;
    }

    /** @param banner_ContentList the banner captions */
    public void setBanner_ContentList(List<String> banner_ContentList) {
        this.banner_ContentList = banner_ContentList;
    }

    /** @return the banner links, or {@code null} if never set */
    public List<String> getBanner_UrlList() {
        return banner_UrlList;
    }

    /** @param banner_UrlList the banner links */
    public void setBanner_UrlList(List<String> banner_UrlList) {
        this.banner_UrlList = banner_UrlList;
    }

    /** @return the banner image URLs, or {@code null} if never set */
    public List<String> getBanner_ImgList() {
        return banner_ImgList;
    }

    /** @param banner_ImgList the banner image URLs */
    public void setBanner_ImgList(List<String> banner_ImgList) {
        this.banner_ImgList = banner_ImgList;
    }

    /**
     * Alias of {@link #getAdvert_Img_List()}: there is no separate advert-name field, so this
     * returns the advert image list. Kept only so existing callers keep compiling.
     *
     * @return the advert image list, or {@code null} if never set
     */
    public List<String> getAdvert_Name_List() {
        return advert_Img_List;
    }

    /**
     * Alias of {@link #setAdvert_Img_List(List)}: the value is stored in the advert image
     * list. Kept only so existing callers keep compiling.
     *
     * @param advert_Img_List the advert image list
     */
    public void setAdvert_Name_List(List<String> advert_Img_List) {
        this.advert_Img_List = advert_Img_List;
    }

    /** @return the advert links, or {@code null} if never set */
    public List<String> getAdvert_Url_List() {
        return advert_Url_List;
    }

    /** @param advert_Url_List the advert links */
    public void setAdvert_Url_List(List<String> advert_Url_List) {
        this.advert_Url_List = advert_Url_List;
    }

    /** @return the navigation-bar-2 entry names, or {@code null} if never set */
    public List<String> getNv2_NameList() {
        return nv2_NameList;
    }

    /** @param nv2_NameList the navigation-bar-2 entry names */
    public void setNv2_NameList(List<String> nv2_NameList) {
        this.nv2_NameList = nv2_NameList;
    }

    /** @return the navigation-bar-2 entry links, or {@code null} if never set */
    public List<String> getNv2_UrlList() {
        return nv2_UrlList;
    }

    /** @param nv2_UrlList the navigation-bar-2 entry links */
    public void setNv2_UrlList(List<String> nv2_UrlList) {
        this.nv2_UrlList = nv2_UrlList;
    }

    /** @return the top-right navigation entry names, or {@code null} if never set */
    public List<String> getNv1_1_NameList() {
        return nv1_1_NameList;
    }

    /** @param nv1_1_NameList the top-right navigation entry names */
    public void setNv1_1_NameList(List<String> nv1_1_NameList) {
        this.nv1_1_NameList = nv1_1_NameList;
    }

    /** @return the top-right navigation entry links, or {@code null} if never set */
    public List<String> getNv1_1_UrlList() {
        return nv1_1_UrlList;
    }

    /** @param nv1_1_UrlList the top-right navigation entry links */
    public void setNv1_1_UrlList(List<String> nv1_1_UrlList) {
        this.nv1_1_UrlList = nv1_1_UrlList;
    }

    /** @return the top-left navigation entry names, or {@code null} if never set */
    public List<String> getNv1_NameList() {
        return nv1_NameList;
    }

    /** @param nv1_NameList the top-left navigation entry names */
    public void setNv1_NameList(List<String> nv1_NameList) {
        this.nv1_NameList = nv1_NameList;
    }

    /** @return the top-left navigation entry links, or {@code null} if never set */
    public List<String> getNv1_UrlList() {
        return nv1_UrlList;
    }

    /** @param nv1_UrlList the top-left navigation entry links */
    public void setNv1_UrlList(List<String> nv1_UrlList) {
        this.nv1_UrlList = nv1_UrlList;
    }

    /** @return the link the logo points to, or {@code null} if never set */
    public String getLogoUrl() {
        return logoUrl;
    }

    /** @param logoUrl the link the logo points to */
    public void setLogoUrl(String logoUrl) {
        this.logoUrl = logoUrl;
    }

    /** @return the logo text, or {@code null} if never set */
    public String getLogoName() {
        return logoName;
    }

    /** @param logoName the logo text */
    public void setLogoName(String logoName) {
        this.logoName = logoName;
    }

    /** @return the logo image URL, or {@code null} if never set */
    public String getLogoImg() {
        return logoImg;
    }

    /** @param logoImg the logo image URL */
    public void setLogoImg(String logoImg) {
        this.logoImg = logoImg;
    }

    /**
     * Renders every field as {@code name=value} inside {@code JsoupBean{...}}; the three logo
     * strings are wrapped in single quotes, and unset fields print as {@code null}.
     */
    @Override
    public String toString() {
        return "JsoupBean{" +
                "logoUrl='" + logoUrl + '\'' +
                ", logoName='" + logoName + '\'' +
                ", logoImg='" + logoImg + '\'' +
                ", nv1_NameList=" + nv1_NameList +
                ", nv1_UrlList=" + nv1_UrlList +
                ", nv1_1_NameList=" + nv1_1_NameList +
                ", nv1_1_UrlList=" + nv1_1_UrlList +
                ", nv2_NameList=" + nv2_NameList +
                ", nv2_UrlList=" + nv2_UrlList +
                ", advert_Img_List=" + advert_Img_List +
                ", advert_Url_List=" + advert_Url_List +
                ", banner_ContentList=" + banner_ContentList +
                ", banner_UrlList=" + banner_UrlList +
                ", banner_ImgList=" + banner_ImgList +
                ", content1_UrlList=" + content1_UrlList +
                ", content1_ContentList=" + content1_ContentList +
                ", rank_UrlList=" + rank_UrlList +
                ", rank_ContentList=" + rank_ContentList +
                '}';
    }
}
XML布局就不贴了,界面上只有一个Button,点击后打印log

本文章仅供学习之用,禁止任何商业用途,若有所需或转载请与作者联系

你可能感兴趣的:(笔记:Android用jsoup抓取网页HTML解析数据)