import contextlib
import cookielib
import sys
import urllib
import urllib2
from sgmllib import SGMLParser


class spider(SGMLParser): |
04 |
def __init__( self ,email,password): |
05 |
SGMLParser.__init__( self ) |
06 |
self .h3 = False |
07 |
self .h3_is_ready = False |
08 |
self .div = False |
09 |
self .h3_and_div = False |
10 |
self .a = False |
11 |
self .depth = 0 |
12 |
self .names = "" |
13 |
self .dic = {} |
14 |
|
15 |
self .email = email |
16 |
self .password = password |
17 |
self .domain = 'renren.com' |
18 |
try : |
19 |
cookie = cookielib.CookieJar() |
20 |
cookieProc = urllib2.HTTPCookieProcessor(cookie) |
21 |
except : |
22 |
raise |
23 |
else : |
24 |
opener = urllib2.build_opener(cookieProc) |
25 |
urllib2.install_opener(opener) |
26 |
27 |
def login( self ): |
28 |
url = 'http://www.renren.com/PLogin.do' |
29 |
postdata = { |
30 |
'email' : self .email, |
31 |
'password' : self .password, |
32 |
'domain' : self .domain |
33 |
} |
34 |
req = urllib2.Request( |
35 |
url, |
36 |
urllib.urlencode(postdata) |
37 |
) |
38 |
|
39 |
self . file = urllib2.urlopen(req).read() |
40 |
#print self.file |
41 |
def start_h3( self ,attrs): |
42 |
self .h3 = True |
43 |
def end_h3( self ): |
44 |
self .h3 = False |
45 |
self .h3_is_ready = True |
46 |
|
47 |
def start_a( self ,attrs): |
48 |
if self .h3 or self .div: |
49 |
self .a = True |
50 |
def end_a( self ): |
51 |
self .a = False |
52 |
|
53 |
def start_div( self ,attrs): |
54 |
if self .h3_is_ready = = False : |
55 |
return |
56 |
if self .div = = True : |
57 |
self .depth + = 1 |
58 |
|
59 |
for k,v in attrs: |
60 |
if k = = 'class' and v = = 'content' : |
61 |
self .div = True ; |
62 |
self .h3_and_div = True #h3 and div is connected |
63 |
def end_div( self ): |
64 |
if self .depth = = 0 : |
65 |
self .div = False |
66 |
self .h3_and_div = False |
67 |
self .h3_is_ready = False |
68 |
self .names = "" |
69 |
if self .div = = True : |
70 |
self .depth - = 1 |
71 |
def handle_data( self ,text): |
72 |
#record the name |
73 |
if self .h3 and self .a: |
74 |
self .names + = text |
75 |
#record says |
76 |
if self .h3 and ( self .a = = False ): |
77 |
if not text: pass |
78 |
else : self .dic.setdefault( self .names,[]).append(text) |
79 |
return |
80 |
if self .h3_and_div: |
81 |
self .dic.setdefault( self .names,[]).append(text) |
82 |
|
83 |
def show( self ): |
84 |
type = sys.getfilesystemencoding() |
85 |
for key in self .dic: |
86 |
print ( (' '.join(key)).replace(' ',' ')).decode(' utf - 8 ').encode( type ), \ |
87 |
( (' '.join(self.dic[key])).replace(' ',' ')).decode(' utf - 8 ').encode( type ) |
# Script entry: log in with the given credentials, parse the returned page
# and print what was collected.
renrenspider = spider('your email', 'your password')
renrenspider.login()
renrenspider.feed(renrenspider.file)
# SGMLParser buffers incomplete input; close() forces it to be processed.
renrenspider.close()
renrenspider.show()