Python登录人人网并抓取新鲜事

 


 1from sgmllib import SGMLParser
 2import sys,urllib2,urllib,cookielib
 3class spider(SGMLParser):
 4    def __init__(self,email,password):
 5        SGMLParser.__init__(self)
 6        self.h3=False
 7        self.h3_is_ready=False
 8        self.div=False
 9        self.h3_and_div=False
10        self.a=False
11        self.depth=0
12        self.names=""
13        self.dic={}   
14         
15        self.email=email
16        self.password=password
17        self.domain='renren.com'
18        try:
19            cookie=cookielib.CookieJar()
20            cookieProc=urllib2.HTTPCookieProcessor(cookie)
21        except:
22            raise
23        else:
24            opener=urllib2.build_opener(cookieProc)
25            urllib2.install_opener(opener)       
26
27    def login(self):
28        url='http://www.renren.com/PLogin.do'
29        postdata={
30                  'email':self.email,
31                  'password':self.password,
32                  'domain':self.domain  
33                  }
34        req=urllib2.Request(
35                            url,
36                            urllib.urlencode(postdata)            
37                            )
38        
39        self.file=urllib2.urlopen(req).read()
40        #print self.file
41    def start_h3(self,attrs):
42        self.h3 = True
43    def end_h3(self):
44        self.h3=False
45        self.h3_is_ready=True
46        
47    def start_a(self,attrs):
48        if self.h3 or self.div:
49            self.a=True
50    def end_a(self):
51        self.a=False
52        
53    def start_div(self,attrs):
54        if self.h3_is_ready == False:
55            return
56        if self.div==True:
57            self.depth += 1
58            
59        for k,v in attrs:
60            if k == 'class' and v == 'content':
61                self.div=True;
62                self.h3_and_div=True   #h3 and div is connected
63    def end_div(self):
64        if self.depth == 0:
65            self.div=False
66            self.h3_and_div=False
67            self.h3_is_ready=False
68            self.names=""
69        if self.div == True:
70            self.depth-=1
71    def handle_data(self,text):
72        #record the name
73        if self.h3 and self.a:
74            self.names+=text
75        #record says
76        if self.h3 and (self.a==False):
77            if not text:pass
78            else: self.dic.setdefault(self.names,[]).append(text)
79            return 
80        if self.h3_and_div:
81            self.dic.setdefault(self.names,[]).append(text)
82            
83    def show(self):
84        type = sys.getfilesystemencoding()
85        for key in self.dic:
86            print ( (''.join(key)).replace(' ','')).decode('utf-8').encode(type), \
87                  ( (''.join(self.dic[key])).replace(' ','')).decode('utf-8').encode(type)
88
89 
90
91
# Driver: log in with the placeholder credentials, parse the page that the
# login request returns, and print what was collected.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# close() makes SGMLParser process any data it is still buffering; without
# it, text at the very end of the page may never reach handle_data().
renrenspider.close()
renrenspider.show()
96

 

你可能感兴趣的:(Python登录人人网并抓取新鲜事)