多线程 Q群 号码爬虫

通过空间历史浏览,爬出查看你空间的人(一般限制20人,除非开通黄钻),然后再爬出这20人的浏览记录,依次向下爬,你可以控制爬行深度。
这里仅仅给出爬虫代码片段,你可以进一步优化,将Q群分类存储。通过Q群相互浏览关系,还可以用绘图工具绘制好友网络,等等。
欢迎更多讨论,请加Q群并注明“读者”。

代码涉及pthreads 如果不清楚请阅读:《PHP 高级编程之多线程》
http://netkiller.github.io/journal/thread.php.html
 
标签: pthreads  PHP
 

代码片段(1)[全屏查看所有代码]

1. [代码][PHP]代码     

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
<?php
/*
Homepage: http://netkiller.github.io
Author: Neo <netkiller@msn.com>
*/
// Abort immediately when the pthreads extension is missing: every class below extends one of its types.
if (extension_loaded('pthreads') === false) {
    die('Please install pthreads');
}
 
include_once ( 'Snoopy.class.php' );
 
/**
 * Worker thread type used by the crawler Pool.
 *
 * It is meant to own one database handle per thread, but the connection
 * code in run() is commented out, so getInstance() currently hands back
 * whatever self::$dbh holds (nothing in this class assigns it).
 */
class CrawlerWorker extends Worker
{
    /** Slot for the per-thread database handle. */
    protected static $dbh;

    /**
     * No construction-time state is needed.
     */
    public function __construct()
    {
    }

    /**
     * Thread start-up hook; intentionally a no-op for now.
     *
     * Connection template kept for reference (enable it to populate self::$dbh):
     *
     *   $dbhost = 'db.example.com';   // database server
     *   $dbuser = 'example.com';      // database user name
     *   $dbpw   = 'password';         // database password
     *   $dbname = 'example';          // database name
     *
     *   self::$dbh = new PDO("mysql:host=$dbhost;port=3306;dbname=$dbname", $dbuser, $dbpw, array(
     *       PDO::MYSQL_ATTR_INIT_COMMAND => 'SET NAMES \'UTF8\'',
     *       PDO::MYSQL_ATTR_COMPRESS     => true,
     *       PDO::ATTR_PERSISTENT         => true,
     *   ));
     */
    public function run()
    {
    }

    /**
     * Accessor used by tasks running on this worker.
     *
     * @return mixed The value stored in self::$dbh.
     */
    protected function getInstance()
    {
        $handle = self::$dbh;

        return $handle;
    }
}
 
/**
 * Crawl task submitted to the Pool.
 *
 * Starting from one seed QQ number it fetches that account's Qzone visitor
 * list, then the visitor lists of those visitors, and so on until $depth
 * levels have been reached. Every fetched list is printed with print_r().
 */
class Crawler extends Stackable {
    /**
     * Level at which the walk stops. The seed is level 1, so the default of 3
     * fetches the seed's visitors and the visitors of each of those.
     */
    public $depth = 3;

    /** Seed QQ number passed to the constructor. */
    public $qq;

    /**
     * @param string|int $qq Seed QQ number to start crawling from.
     */
    public function __construct($qq) {
        $this->qq = $qq;
    }

    /**
     * Task entry point executed on a worker thread.
     *
     * A PDOException is appended to mobile_error.log as "<seed qq>,<message>".
     */
    public function run() {
        try {
            // Fetched for future persistence of results; nothing below uses it yet.
            $dbh = $this->worker->getInstance();
            $this->recursion(array($this->qq));
        } catch (PDOException $e) {
            // Log the seed and the actual failure (the previous version formatted two undefined variables).
            $error = sprintf("%s,%s\n", $this->qq, $e->getMessage());
            file_put_contents("mobile_error.log", $error, FILE_APPEND);
        }
    }

    /**
     * Walks the visitor graph depth-first.
     *
     * The current level travels as an argument instead of living in a static
     * counter, so it is restored automatically when a call returns and does
     * not leak into the next task executed by the same worker thread.
     *
     * @param array $qqs   QQ numbers whose visitor lists should be fetched.
     * @param int   $level Level of this call; callers normally omit it (seed level is 1).
     * @return void
     */
    public function recursion($qqs, $level = 1) {
        printf("Level: %s\n", $level);

        // Random pause (10ms - 1s) before going on, to space out the HTTP requests.
        usleep(mt_rand(10000, 1000000));

        if ($level >= $this->depth) {
            return;
        }
        if (!is_array($qqs)) {
            // Nothing iterable to walk (e.g. a caller passed null).
            return;
        }

        foreach ($qqs as $uin) {
            $lst = $this->qzone($uin);
            print_r($lst);
            $this->recursion($lst, $level + 1);
        }
    }

    /**
     * Fetches the visitor list of one QQ account from the mobile Qzone endpoint.
     *
     * @param string|int $qq Account whose visitors are requested.
     * @return array List of visitor uin values; empty when the request fails
     *               or the response does not contain data->list.
     */
    public function qzone($qq) {
        // NOTE(review): g_tk, t and sid are hard-coded in the query string; they look like
        // values captured from a logged-in session and presumably need refreshing - confirm.
        $url = 'http://m.qzone.com/mqz_get_visitor?g_tk=1191852101&res_mode=0&res_uin=' . rawurlencode($qq) . '&offset=0&count=100&page=1&format=json&t=1401762986882&sid=dODKVcYv6azjN87cxXQ5mao1xgakYjHg18c8aa5e0201%3D%3D';

        $snoopy = new Snoopy;

        // Proxy, cookies and HTTP auth can be configured on $snoopy here
        // (proxy_host / proxy_port, cookies[...], user / pass) when required.

        // Browser identification and referer sent with the request.
        $snoopy->agent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)";
        $snoopy->referer = "http://m.qzone.com/";

        // Extra raw header.
        $snoopy->rawheaders["Pragma"] = "no-cache";

        // Redirect / link handling.
        $snoopy->maxredirs = 2;
        $snoopy->offsiteok = false;
        $snoopy->expandlinks = false;

        // Fetch the visitor endpoint; the body is expected to be JSON.
        if (!$snoopy->fetchtext($url)) {
            print "Snoopy: error while fetching document: " . $snoopy->error . "\n";
            // Always hand back an array so callers can iterate the result safely.
            return array();
        }

        $results = array();
        $decoded = json_decode($snoopy->results);

        // isset() walks the chain safely: it is false when the JSON is invalid,
        // not an object, or lacks data / data->list.
        if (!is_object($decoded) || !isset($decoded->data->list)) {
            return $results;
        }

        $visitors = $decoded->data->list;
        if (!is_array($visitors) && !is_object($visitors)) {
            return $results;
        }

        foreach ($visitors as $visitor) {
            if (is_object($visitor) && isset($visitor->uin)) {
                $results[] = $visitor->uin;
            }
        }

        return $results;
    }
}
 
// Pool sized at 100 CrawlerWorker threads; the workers take no constructor arguments.
$pool = new Pool(100, \CrawlerWorker::class, []);

// Seed QQ numbers, one crawl task each. Append further numbers here, or
// generate them in bulk, e.g. with range(1000, 100000).
$seeds = ['13721218', '291379'];

foreach ($seeds as $seed) {
    $pool->submit(new Crawler($seed));
}

// Shut the pool down once the tasks have been submitted.
$pool->shutdown();
?>

你可能感兴趣的:(PHP,netkiller,pthreads)