python的正则与网页操作练习二:


import re
import urllib.request

#51cto urlcode=gb18030

class down51web:
    """Download a web page and extract blog metadata with caller-supplied regexes.

    The instance keeps the most recently downloaded text in ``s_html``; the
    ``get_page`` / ``get_blogid`` helpers search that text, so ``get_html``
    must have been called (or ``s_html`` assigned) before they can find
    anything.
    """

    # Class-level defaults; instances overwrite them as the methods run.
    s_url = ''        # URL fetched by get_html()
    s_blogid = ''     # value stored by get_blogid()
    s_blogpages = ''  # page count as captured text, stored by get_page()
    s_html = ''       # decoded body of the most recent download
    s_code = ''       # codec name used to decode downloaded bytes

    def __init__(self, url, code):
        """Remember the URL to fetch and the codec used to decode it."""
        self.s_url = url
        self.s_code = code

    def get_html(self):
        """Download ``s_url``, decode it with ``s_code``, store and return the text.

        Errors raised by ``urlopen`` or by decoding propagate to the caller.
        """
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(self.s_url) as response:
            self.s_html = response.read().decode(self.s_code)
        return self.s_html

    def get_page(self, r_page):
        """Search ``s_html`` with *r_page* and store group 1 in ``s_blogpages``.

        Returns ``s_blogpages``, which is left unchanged when there is no
        downloaded text or no match.
        """
        if self.s_html:
            found = r_page.search(self.s_html)
            if found:
                self.s_blogpages = found.group(1)
        return self.s_blogpages

    def get_blogid(self, r_blogid):
        """Search ``s_html`` with *r_blogid* and store the blog id.

        The id is the second '/'-separated field of group 1, so an
        ``IndexError`` is raised if the captured text contains no '/'.
        Returns ``s_blogid``, unchanged when nothing matches.
        """
        if self.s_html:
            found = r_blogid.search(self.s_html)
            if found:
                self.s_blogid = found.group(1).split('/')[1]
        return self.s_blogid

    def get_blogpagelist(self):
        """Return ``<s_url>/<s_blogid>/p-<n>`` for n = 1..int(s_blogpages).

        Returns an empty list until both ``s_blogid`` and ``s_blogpages``
        have been set. ``int()`` raises ``ValueError`` if the stored page
        count is not numeric.
        """
        if not (self.s_blogid and self.s_blogpages):
            return []
        prefix = self.s_url + '/' + self.s_blogid + '/p-'
        return [prefix + str(page) for page in range(1, int(self.s_blogpages) + 1)]

    def get_pagelist(self, r_list, url):
        """Download *url* and return ``r_list.findall`` over its text.

        Side effect: ``s_url`` and ``s_html`` are replaced by *url* and the
        newly downloaded text.
        """
        self.s_url = url
        page_html = self.get_html()
        return r_list.findall(page_html) if page_html else []






# NOTE(review): the three pattern literals below arrived damaged -- HTML tags
# that were presumably part of them appear to have been stripped when the page
# was extracted. They are kept as found (only re-quoted so the file parses) and
# must be restored from the original post before this script can run:
#   * r_page's lookahead now only contains the line break left in the literal;
#   * r_blogid does not compile (unbalanced parenthesis);
#   * r_list has a single group, but the loop below reads x[0] and x[1].
r_page = re.compile(r'页数 \( [0-9]+/([0-9]+) \)(?=\n)')
r_blogid = re.compile(r'(?<=)]*)>')
r_list = re.compile(r'([^<]*)')

xx = 'http://hxw168.blog.51cto.com'
hxw = down51web(xx, 'gb18030')
hxw.get_html()
# print(hxw.s_html)
print(hxw.get_page(r_page))
print(hxw.get_blogid(r_blogid))

# Deliberately not named `list`, which would shadow the builtin.
page_urls = hxw.get_blogpagelist()
for page_url in page_urls:
    # Each listing page is downloaded and scanned for its entries.
    for x in hxw.get_pagelist(r_list, page_url):
        print(xx + x[0] + '-------' + x[1])

上面的代码排版不正常（缩进和换行丢失），下面重新贴一遍：



import re
import urllib.request
#51cto urlcode=gb18030
class down51web:
    """Download a web page and extract blog metadata with caller-supplied regexes.

    The instance keeps the most recently downloaded text in ``s_html``; the
    ``get_page`` / ``get_blogid`` helpers search that text, so ``get_html``
    must have been called (or ``s_html`` assigned) before they can find
    anything.
    """

    # Class-level defaults; instances overwrite them as the methods run.
    s_url = ''        # URL fetched by get_html()
    s_blogid = ''     # value stored by get_blogid()
    s_blogpages = ''  # page count as captured text, stored by get_page()
    s_html = ''       # decoded body of the most recent download
    s_code = ''       # codec name used to decode downloaded bytes

    def __init__(self, url, code):
        """Remember the URL to fetch and the codec used to decode it."""
        self.s_url = url
        self.s_code = code

    def get_html(self):
        """Download ``s_url``, decode it with ``s_code``, store and return the text.

        Errors raised by ``urlopen`` or by decoding propagate to the caller.
        """
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(self.s_url) as response:
            self.s_html = response.read().decode(self.s_code)
        return self.s_html

    def get_page(self, r_page):
        """Search ``s_html`` with *r_page* and store group 1 in ``s_blogpages``.

        Returns ``s_blogpages``, which is left unchanged when there is no
        downloaded text or no match.
        """
        if self.s_html:
            found = r_page.search(self.s_html)
            if found:
                self.s_blogpages = found.group(1)
        return self.s_blogpages

    def get_blogid(self, r_blogid):
        """Search ``s_html`` with *r_blogid* and store the blog id.

        The id is the second '/'-separated field of group 1, so an
        ``IndexError`` is raised if the captured text contains no '/'.
        Returns ``s_blogid``, unchanged when nothing matches.
        """
        if self.s_html:
            found = r_blogid.search(self.s_html)
            if found:
                self.s_blogid = found.group(1).split('/')[1]
        return self.s_blogid

    def get_blogpagelist(self):
        """Return ``<s_url>/<s_blogid>/p-<n>`` for n = 1..int(s_blogpages).

        Returns an empty list until both ``s_blogid`` and ``s_blogpages``
        have been set. ``int()`` raises ``ValueError`` if the stored page
        count is not numeric.
        """
        if not (self.s_blogid and self.s_blogpages):
            return []
        prefix = self.s_url + '/' + self.s_blogid + '/p-'
        return [prefix + str(page) for page in range(1, int(self.s_blogpages) + 1)]

    def get_pagelist(self, r_list, url):
        """Download *url* and return ``r_list.findall`` over its text.

        Side effect: ``s_url`` and ``s_html`` are replaced by *url* and the
        newly downloaded text.
        """
        self.s_url = url
        page_html = self.get_html()
        return r_list.findall(page_html) if page_html else []

r_page=re.compile('页数 \( [0-9]+/([0-9]+) \)(?=