Python实战抓取捧腹网笑话数据

BeautifulSoup 解析html

# 抓取捧腹网笑话数据

# https://www.pengfu.com/xiaohua_1.html

import urllib

import urllib2

from bs4 import BeautifulSoup

import os

import codecs

import sys

# Loop flag: the scraping loop further down runs while this is True and sets
# it to False once the current page index reaches the parsed page number.
status = True

def remvoeRNT(text):
    """Return ``text`` with every tab, carriage return and newline removed.

    Other characters, including ordinary spaces, are left untouched.
    """
    for ch in ("\t", "\r", "\n"):
        text = text.replace(ch, "")
    return text

def openFile(title, message, path='/Users/fukai/Desktop/捧腹网数据.txt'):
    """Append one joke (title line, then message line) to a UTF-8 text file.

    Args:
        title: Text written on the first line.
        message: Text written on the second line.
        path: File to append to; defaults to the scraper's output file.
    """
    # Append mode creates the file when it is missing, so no separate
    # existence check or creation step is needed before opening.
    # ``with`` guarantees the handle is closed even if a write fails.
    with codecs.open(path, 'a', 'utf-8') as out:
        out.write(title + "\n")
        out.write(message + "\n")

def estabish(path='/Users/fukai/Desktop/捧腹网数据.txt'):
    """Create an empty file at ``path`` if nothing exists there yet.

    An existing file is left untouched.

    Args:
        path: File to create; defaults to the scraper's output file.
    """
    if not os.path.exists(path):
        # The previous code called ``os.msnod``, which does not exist (a typo
        # for ``os.mknod``) and raised AttributeError. Opening in append mode
        # creates the file portably and never truncates; ``mknod`` is
        # Unix-only and may require elevated privileges on some platforms.
        open(path, 'a').close()

index = 0
page = 0

# Fetch listing pages one at a time until the current index reaches the page
# number read from the page's own pagination markup.
while status:
    response = urllib2.urlopen("https://www.pengfu.com/xiaohua_%d.html" % index)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()

    soup = BeautifulSoup(html, 'html.parser')
    messages = soup.find_all("div", class_="content-img clearfix pt10 relative")
    h1s = soup.find_all("h1", class_="dp-b")

    # Pagination element whose text is parsed as the page number; presumably
    # the "last page" link -- TODO confirm against the site markup. Falls
    # back to <span class="on"> when that link is absent.
    tagNum = soup.find("a", class_="page-a page-04")
    if tagNum is None:
        tagNum = soup.find("span", class_="on")

    try:
        tagNumText = tagNum.text
        # Single-argument print(...) behaves the same as the Python 2 print
        # statement.
        print(str(index) + "  " + tagNumText)
        page = int(tagNumText)
    except (AttributeError, ValueError):
        # No pagination element was found (tagNum is None) or its text is not
        # a number: keep the previous ``page`` value and show what was found.
        print(tagNum)

    # Pair each joke body with its title. zip stops at the shorter list, so a
    # page with fewer titles than bodies no longer raises IndexError.
    for tag, tag1 in zip(messages, h1s):
        title = remvoeRNT(tag1.text)
        message = remvoeRNT(tag.text)
        openFile(title, message)

    if index >= page:
        status = False
    else:
        index = index + 1

print("结束网页解析")

你可能感兴趣的:(Python实战抓取捧腹网笑话数据)