第一次爬取的电子书

import requests

from bs4 import BeautifulSoup
import codecs
def get_url(url, timeout=30):
    """Collect the links listed on an index page.

    Downloads ``url``, parses the response body with BeautifulSoup's
    ``lxml`` parser and gathers the ``href`` of every anchor matched by the
    CSS selector ``.inner li a``.

    Args:
        url: Address of the index page to scan.
        timeout: Seconds ``requests`` waits for the server before giving up,
            so a stalled connection cannot hang the script forever.

    Returns:
        A list of URLs in document order. Relative links are resolved
        against the URL the response was actually served from; anchors
        without an ``href`` attribute are skipped.
    """
    # Imported locally so this function is self-contained with respect to
    # the file's import header.
    from urllib.parse import urljoin

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "lxml")
    anchors = soup.select(".inner li a")

    # urljoin leaves absolute hrefs unchanged and turns relative ones into
    # something requests.get() can fetch later.
    return [
        urljoin(response.url, anchor["href"])
        for anchor in anchors
        if anchor.get("href")
    ]
def get_data(url, timeout=30):
    """Download one page and append its text to ``output.txt``.

    The page is parsed with BeautifulSoup's ``lxml`` parser. The text of
    every element matching ``#BookText .calibre_14`` (used here as the
    chapter name) is written first, followed by the text of every element
    matching ``.calibre_15`` (used here as the chapter content), and finally
    a line of asterisks as a separator.

    Args:
        url: Address of the page to download.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        None. The result is the text appended to ``output.txt`` in the
        current working directory, encoded as UTF-8.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, "lxml")

    # The context manager closes the file even if a write raises.
    # codecs.open works on the underlying bytes, so the explicit "\r\n"
    # sequences below are written exactly as given.
    with codecs.open("output.txt", "a+", "utf-8") as fo:
        for heading in soup.select("#BookText .calibre_14"):
            fo.write(heading.text + "\r\n")
        for paragraph in soup.select(".calibre_15"):
            fo.write("\r\n" + paragraph.text + "\r\n")
        # Separator between consecutive pages in the output file.
        fo.write("\r\n" + "********************" + "\r\n")

if __name__ == "__main__":
    # Script entry point: read the link list from the book's index page,
    # then process each linked page in order with get_data().
    url = "http://www.yuedu88.com/TheArtofFielding/"
    url_list = get_url(url)
    for x in url_list:
        get_data(x)

你可能感兴趣的:(python基础,爬虫)