爬虫练习题

import os
import re
from urllib import request
# Request headers sent with every HTTP request made by this script.
headers = {'User-Agent': 'zsjggye'}

# Base URL that the relative chapter links from the catalog page are appended to.
SITE_ROOT = 'http://www.doupoxs.com'

# NOTE(review): the HTML tags inside the two tag patterns below were stripped
# when this script was published, so `<li>...</li>` and `<p>...</p>` are
# reconstructions -- confirm them against the site's current markup.
# DOTALL lets a catalog item span several lines of HTML.
CATALOG_ITEM_RE = re.compile(r'<li>.*?</li>', re.S)
CHAPTER_LINK_RE = re.compile(r'href="([^>"]*)"[\s]*title="([^>"]*)"')
PARAGRAPH_RE = re.compile(r'<p>(.*?)</p>')


def parse_catalog(html):
    """Extract chapter entries from the HTML of a catalog page.

    Args:
        html: Decoded HTML text of the catalog page.

    Returns:
        A list of ``{'title': ..., 'url': ...}`` dicts, one per list item
        that contains an ``href="..." title="..."`` pair, in page order.
        Items without such a pair are skipped.
    """
    chapters = []
    for item in CATALOG_ITEM_RE.findall(html):
        link = CHAPTER_LINK_RE.search(item)
        if link is not None:
            chapters.append({
                'title': link.group(2),
                'url': SITE_ROOT + link.group(1),
            })
    return chapters


def parse_paragraphs(html):
    """Return the text captured between each ``<p>`` / ``</p>`` pair in *html*."""
    return PARAGRAPH_RE.findall(html)


def getcatelogs(url):
    """Download a catalog page and return the chapters it lists.

    Args:
        url: Address of the catalog page.

    Returns:
        The list produced by ``parse_catalog``; an empty list when the
        response status is not 200.
    """
    req = request.Request(url=url, headers=headers, method='GET')
    # The context manager closes the connection even if decoding fails.
    with request.urlopen(req) as response:
        if response.status != 200:
            return []
        html = response.read().decode('UTF-8')
    return parse_catalog(html)


def getchapterContent(chapters):
    """Download every chapter and append its paragraphs to a text file.

    For each chapter whose page answers with status 200, the captured
    paragraphs are appended, one per line, to ``novel/<title>.txt`` and the
    chapter title and URL are printed. Chapters with any other status are
    skipped.

    Args:
        chapters: Iterable of dicts with ``'title'`` and ``'url'`` keys, as
            returned by ``getcatelogs``.
    """
    for chapter in chapters:
        req = request.Request(url=chapter['url'], headers=headers, method='GET')
        with request.urlopen(req) as response:
            if response.status != 200:
                continue
            # NOTE(review): chapter pages are assumed to be UTF-8 like the
            # catalog page -- confirm.
            html = response.read().decode('UTF-8')

        # Create the output directory on demand so the first run does not fail.
        os.makedirs('novel', exist_ok=True)
        # Explicit encoding: the platform default may not be able to encode
        # the chapter text.
        with open('novel/' + chapter['title'] + '.txt', 'a+', encoding='utf-8') as f:
            f.writelines(content + '\n' for content in parse_paragraphs(html))
        print(chapter['title'], chapter['url'])


# Run the crawl only when executed as a script, not when imported.
if __name__ == '__main__':
    getchapterContent(getcatelogs('http://www.doupoxs.com/nalanwudi'))

    爬虫练习题_第1张图片

你可能感兴趣的:(笔记,python,爬虫)