Python爬虫实战(二):爬取天涯帖子(只看楼主)

先上代码


#coding=utf-8
import requests
from bs4 import Tag
from bs4 import BeautifulSoup


def getHtml(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Returns:
        The response body decoded to text (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the request fails, e.g.
            the server does not answer within ``timeout`` seconds.

    Note:
        The HTTP status code is not checked; an error page is returned as
        text just like a normal page.
    """
    # Without an explicit timeout, requests will wait indefinitely on an
    # unresponsive server and the whole crawl would hang.
    page = requests.get(url, timeout=timeout)
    return page.text

def getText(html, page=None):
    """Print the posts on one thread page whose author block contains "楼主".

    Author blocks are the ``div.atl-info`` elements and post bodies are the
    ``div.bbs-content`` elements; they are paired up by position.

    Args:
        html: HTML source of a single thread page.
        page: 1-based page number of ``html``. When omitted, the value of
            the module-level variable ``i`` is used if it exists (this is
            how the script's own loop communicates the page number);
            otherwise the page is treated as page 1.

    Returns:
        None. Matching posts are written to standard output, stripped of
        surrounding whitespace.
    """
    if page is None:
        # Backward compatibility: legacy callers only pass ``html`` and
        # expose the page number through a module-level ``i``.
        page = globals().get('i', 1)

    soup = BeautifulSoup(html, 'html.parser')

    authors = [tag.get_text()
               for tag in soup.find_all('div', class_='atl-info')]
    posts = [tag.get_text()
             for tag in soup.find_all('div', class_='bbs-content')]

    if page > 1:
        # Shift the post list by one so that it lines up with the author
        # list on follow-up pages.
        # NOTE(review): presumably the first atl-info block on pages after
        # the first has no matching bbs-content block -- confirm against
        # the live markup.
        posts = [""] + posts

    # zip() stops at the shorter list, so a page with fewer post bodies
    # than author blocks no longer raises IndexError.
    for author, post in zip(authors, posts):
        if "楼主" in author:
            print(post.strip())
            
if __name__ == '__main__':
    # Crawl pages 1 through 5 of the thread, printing the matching posts
    # of each page in turn.
    page_url = "http://bbs.tianya.cn/post-feeling-4286798-%s.shtml"
    # The loop variable has to stay a module-level name called ``i``:
    # getText() may read it to learn which page it is processing.
    for i in range(1, 6):
        getText(getHtml(page_url % str(i)))





  



刚学Python不到一个月,代码写得有点乱,以后再优化。



你可能感兴趣的:(Python,爬虫)