Python 爬虫学习笔记(四)

Python 爬虫学习笔记(四)

【Python网络爬虫与信息提取】.MOOC. 北京理工大学

  • 中国大学排名定向爬虫

    #视频33:中国大学排名定向爬虫实例
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    def getHTMLtext(url):
        """Fetch the university ranking page and return its text.

        Args:
            url: Address of the page to download.

        Returns:
            The decoded response body, or an empty string when the request
            fails (connection error, timeout, or a non-2xx status).
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            # Use the encoding guessed from the body rather than the one
            # declared in the headers before decoding the text.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Best-effort fetch: callers receive "" instead of an exception.
            return ""
    
    def fillUnivlist(ulist, html):
        """Extract table rows from the page HTML and append them to ``ulist``.

        For every ``<tr>`` tag that is a direct child of the first ``<tbody>``,
        a list holding the ``.string`` of its first, second and fourth ``<td>``
        cells is appended to ``ulist``.

        Args:
            ulist: List that receives one three-item list per table row.
            html: HTML source of the page; may be empty.
        """
        soup = BeautifulSoup(html, "html.parser")
        # Only the first tbody is used, hence find() rather than find_all().
        tbody = soup.find('tbody')
        if tbody is None:
            # Empty or unexpected page (e.g. the download failed): nothing to add.
            return
        for tr in tbody.children:
            # .children also yields whitespace text nodes; keep real tags only.
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr('td')
            if len(tds) < 4:
                # Row without the cells we index below; skip instead of crashing.
                continue
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
    
    def printUnivList(ulist, num):
        """Print a header line followed by the first ``num`` rows of ``ulist``.

        Args:
            ulist: Sequence of rows; the first three items of each row are
                printed as centred, tab-separated columns.
            num: Maximum number of rows to print. If ``ulist`` holds fewer
                rows, only the available ones are printed.
        """
        # Header and data rows share one template so the columns line up.
        row_format = "{:^10}\t{:^6}\t{:^10}"
        print(row_format.format("排名", "学校", "得分"))
        # Slicing tolerates lists shorter than num; max() keeps a negative num
        # from slicing off the tail instead of printing nothing.
        for u in ulist[:max(num, 0)]:
            print(row_format.format(u[0], u[1], u[2]))
    
    def main():
        """Download the 2018 ranking page, parse it and print the first 20 rows."""
        uinfo = []  # filled in place by fillUnivlist with one entry per table row
        url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html"
        html = getHTMLtext(url)
        fillUnivlist(uinfo, html)
        printUnivList(uinfo, 20)
    main()
    
  • 中国大学排名爬虫实例优化

    #视频34:中国大学排名定向爬虫实例优化
    import requests
    from bs4 import BeautifulSoup
    import bs4
    
    
    def getHTMLText(url):
        """Download ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.

        Returns:
            The decoded page text, or an empty string when the request fails
            (connection error, timeout, or a non-2xx status).
        """
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            # Decode with the encoding detected from the content itself.
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Only request failures are swallowed; KeyboardInterrupt and
            # programming errors are no longer hidden by a bare except.
            return ""
    
    def fillUnivList(ulist, html):
        """Parse ``html`` and append one entry per table row to ``ulist``.

        Each entry is a list with the ``.string`` of the first, second and
        fourth ``<td>`` of a ``<tr>`` found directly under the first ``<tbody>``.

        Args:
            ulist: List extended in place with the extracted rows.
            html: HTML source of the page; may be empty.
        """
        soup = BeautifulSoup(html, "html.parser")
        tbody = soup.find('tbody')
        if tbody is None:
            # No table body (e.g. empty page after a failed download).
            return
        for tr in tbody.children:
            # Skip the text nodes that sit between the row tags.
            if not isinstance(tr, bs4.element.Tag):
                continue
            tds = tr('td')
            if len(tds) < 4:
                # Not enough cells for the indexes used below.
                continue
            ulist.append([tds[0].string, tds[1].string, tds[3].string])
    
    def printUnivList(ulist, num):
        """Print a header and up to ``num`` rows with aligned CJK columns.

        Args:
            ulist: Sequence of rows; the first three items of each row are
                printed as centred, tab-separated columns.
            num: Maximum number of rows to print. If ``ulist`` holds fewer
                rows, only the available ones are printed.
        """
        # The middle column is padded with the fill character passed as
        # argument 3 instead of the default ASCII space.
        tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}"
        fill = chr(12288)  # U+3000 IDEOGRAPHIC SPACE

        print(tplt.format("排名", "名称", "总分", fill))
        # Slicing tolerates lists shorter than num; max() keeps a negative num
        # from slicing off the tail instead of printing nothing.
        for u in ulist[:max(num, 0)]:
            print(tplt.format(u[0], u[1], u[2], fill))
    
    
    
    def main():
        """Fetch the 2018 ranking page, collect its rows and print the first 20."""
        url = "http://www.zuihaodaxue.com/zuihaodaxuepaiming2018.html"
        uinfo = []
        # Download, parse into uinfo (filled in place), then display.
        fillUnivList(uinfo, getHTMLText(url))
        printUnivList(uinfo, 20)

    main()
    
    
    

你可能感兴趣的:(Python,爬虫)