一个简单的python爬虫

如果想爬取百度百科、知乎这一类网站,首先要构造一个请求头(headers,其中包含 User-Agent),然后在调用 requests.get 时通过 headers 参数一并传过去。

# CrawUniRanking.py

import requests
import urllib3
from bs4 import BeautifulSoup

# Module-level accumulator: fillUnivList appends one list of cell values per
# table row, and printUnivList reads rows back out of it by index.
allUniv = []

def getHTMLText(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    A browser-like ``User-Agent`` header is sent with the request so that the
    target site sees an ordinary browser rather than the default client
    identifier.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The page text, or an empty string when the request fails or the
        server answers with an HTTP error status.
    """
    # Adjacent string literals are joined by the parser, so the header value
    # contains no stray indentation (a backslash continuation inside a string
    # literal would embed the next line's leading spaces into the value).
    header = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/51.0.2704.106 Safari/537.36"
        )
    }
    try:
        r = requests.get(url, headers=header, timeout=timeout)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except requests.RequestException:
        # Best-effort fetch: network and HTTP failures are reported to the
        # caller as an empty page instead of propagating.
        return ""

def fillUnivList(soup):
    """Append the cell texts of every table row in ``soup`` to ``allUniv``.

    Each ``<tr>`` that contains at least one ``<td>`` contributes one list of
    strings (one entry per cell, surrounding whitespace stripped). Rows
    without ``<td>`` cells are skipped.

    Args:
        soup: Parsed document exposing ``find_all`` (the BeautifulSoup object
            built in ``main``).
    """
    for tr in soup.find_all('tr'):
        cells = tr.find_all('td')
        if not cells:
            continue
        # ``td.string`` is None whenever a cell wraps its text in nested tags,
        # which later breaks string formatting; get_text() always returns str.
        allUniv.append([td.get_text(strip=True) for td in cells])

def printUnivList(num):
    """Print up to ``num`` rows of ``allUniv`` as six centered columns.

    Only the first six cells of each row are shown. Rows with fewer cells are
    padded with empty strings, and ``None`` cells are printed as empty. If
    fewer than ``num`` rows are available, all available rows are printed.

    Args:
        num: Maximum number of rows to print; values <= 0 print nothing.
    """
    template = "{1:^2} {2:{0}^20} {3:{0}^30} {4:{0}^20} {5:{0}^10} {6:{0}^10}"
    # U+3000 (ideographic space) is used as the fill character for the wide
    # columns.
    pad = chr(12288)
    # Slicing instead of indexing avoids IndexError when num > len(allUniv).
    for row in allUniv[:max(num, 0)]:
        # NOTE: cells are scraped page text (untrusted input), so they are
        # formatted as plain strings and never passed to eval().
        cells = ["" if cell is None else str(cell) for cell in row[:6]]
        cells += [""] * (6 - len(cells))
        print(template.format(pad, *cells))

def main(num):
    """Fetch the ranking page, collect its table rows and print the first ``num``.

    Args:
        num: Maximum number of rows to print.
    """
    # Baidu Baike entry for "世界100所著名大学排行榜" (percent-encoded).
    url = 'https://baike.baidu.com/item/%E4%B8%96%E7%95%8C100%E6%89%80%E8%91%97%E5%90%8D%E5%A4%A7%E5%AD%A6%E6%8E%92%E8%A1%8C%E6%A6%9C/12680287?fr=aladdin'
    html = getHTMLText(url)
    if not html:
        # getHTMLText signals any download failure with an empty string.
        print("Failed to download the ranking page; nothing to print.")
        return
    soup = BeautifulSoup(html, "html.parser")
    fillUnivList(soup)
    # Never ask for more rows than were actually parsed.
    printUnivList(min(num, len(allUniv)))

if __name__ == "__main__":
    # Run the scraper only when executed as a script, so importing this
    # module does not trigger a network request.
    main(10)

 

你可能感兴趣的:(爬虫)