燕曦版友信息统计脚本

connector.py
import  urllib, urllib2, cookielib

class MyConnector:
    """Small HTTP helper that logs in once and then fetches pages.

    ``login`` installs a cookie-aware opener as the urllib2 default, so the
    cookies set by the login response are sent by every later ``getHTML``
    call.
    """

    def __init__(self):
        # Most recent response object; set by login() and getHTML().
        self.sock = None

    def login(self, url):
        """POST the guest credentials to *url* and keep the session cookies.

        Args:
            url: Address of the login form handler.
        """
        cookie_jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
        # Installing the opener globally is what lets the plain
        # urllib2.urlopen() calls below reuse the cookies.
        urllib2.install_opener(opener)
        form_data = urllib.urlencode({'id': 'guest', 'passwd': ''})
        self.sock = urllib2.urlopen(url, form_data)
        # The login response body is never read; release the connection.
        self.sock.close()

    def getHTML(self, url):
        """Fetch *url* and return the response body.

        Args:
            url: Absolute URL of the page to download.

        Returns:
            The body exactly as returned by the response's ``read()``.
        """
        self.sock = urllib2.urlopen(url)
        try:
            return self.sock.read()
        finally:
            # Always release the connection, even if read() fails.
            self.sock.close()

yanxiparser.py
from  sgmllib  import  SGMLParser
import  re

class  YanxiURLParser(SGMLParser):
    
def  reset(self):
        self.result 
=  []
        SGMLParser.reset(self)
    
    
def  start_a(self, attrs):
        
for  (k, v)  in  attrs:
            
if  (k  ==   ' href '   and  (v.find( ' bbsanc ' >=  0)):
                self.result.append(v)
                
class YanxiHTMLParser:
    """Pull the id / origin / birthday / favourite fields out of a page.

    The patterns are literal byte sequences; they look like GBK-encoded
    Chinese marker phrases surrounding each field (TODO confirm encoding),
    so ``html`` is expected to be the undecoded page body.
    """

    @staticmethod
    def _capture(pattern, text):
        """Return the stripped first group of *pattern* in *text*, or ''."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else ''

    def parse(self, html):
        """Extract the member fields from *html*.

        Args:
            html: Raw page body.

        Returns:
            Dict with keys ``"id"``, ``"from"``, ``"birth"`` and ``"fav"``.
            A field whose marker phrase is not found is returned as ``''``
            instead of aborting the whole parse.
        """
        # NOTE(review): the first argument was lost when the source was
        # rendered as HTML; '&nbsp;' is assumed - confirm against the pages.
        html = html.replace(r'&nbsp;', ' ')
        html = html.replace(r'<br />', '')

        uid = self._capture('\xbe\xcd\xca\xc7(.*)\xc0\xb2', html)
        # The terminator is \xa3 followed by either \xac or \xa1.
        ufrom = self._capture('\xc0\xb4\xd7\xd4(.*)\xa3(\xac|\xa1)', html)
        ufav = self._capture('\xcf\xb2\xbb\xb6(.*)\n', html)
        # The birthday text is whatever precedes the marker on its own line.
        ubirth = self._capture(
            '\n(.*)\xca\xc7\xce\xd2\xb5\xc4\xc9\xfa\xc8\xd5', html)

        return {"id": uid, "from": ufrom, "birth": ubirth, "fav": ufav}

runner.py
from  connector  import  MyConnector
from  yanxiparser  import   *

# Site root; the hrefs collected from a listing page are appended to it.
rootURL = 'http://yanxibbs.cn'
# Login form handler used to obtain the session cookies.
loginURL = 'http://yanxibbs.cn/bbslogin.php'
# The two listing pages whose linked entries are reported.
url1 = 'http://yanxibbs.cn/cgi-bin/bbs/bbs0an?path=%2Fgroups%2FGROUP%5F3%2F06SS%2Fbyxx%2Fbjcy'
url2 = 'http://yanxibbs.cn/cgi-bin/bbs/bbs0an?path=%2Fgroups%2FGROUP%5F3%2F06SS%2Fbyxx%2Fbjyr'

# Log in once up front; every later fetch goes through this connector.
conn = MyConnector()
conn.login(loginURL)

def printInfo(url):
    """Print one tab-separated line per entry linked from the page at *url*.

    The listing page is scanned for matching links; each linked page is
    downloaded, parsed, and printed as ``id<TAB>from<TAB>birth<TAB>fav``.

    Args:
        url: Absolute URL of the listing page.
    """
    listing_html = conn.getHTML(url)
    urlParser = YanxiURLParser()
    urlParser.feed(listing_html)
    # Flush any input the SGML parser is still buffering.
    urlParser.close()

    htmlParser = YanxiHTMLParser()
    for targetURL in urlParser.result:
        # Collected hrefs are site-relative, so prefix the site root.
        info = htmlParser.parse(conn.getHTML(rootURL + targetURL))
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("%(id)s\t%(from)s\t%(birth)s\t%(fav)s" % info)
    
# Run the report for each of the two listing URLs.
printInfo(url1)
printInfo(url2)

你可能感兴趣的:(燕曦版友信息统计脚本)