python网络蜘蛛

用 Python 编写的简单网络蜘蛛:使用 chardet 库检测网页编码并解码,关键技术是用多线程进行超时控制,以及用正则表达式解析 HTML。

import re
import urllib.request
import chardet
import _thread
import time


def time_control():
    """Watchdog loop meant to run in a background thread.

    Once per second it checks how long ago the global ``TIME_C`` timestamp
    was refreshed. When more than 10 seconds have passed it resets
    ``TIME_C`` and calls ``_thread.interrupt_main()``, which makes the main
    thread see a ``KeyboardInterrupt``. The function never returns.
    """
    global TIME_C
    while True:
        time.sleep(1)
        stalled_for = time.time() - TIME_C
        if stalled_for <= 10:
            continue
        # Restart the countdown before interrupting so the main thread is
        # not interrupted again immediately.
        TIME_C = time.time()
        _thread.interrupt_main()
def put_in(from_url):
    """Fetch ``from_url`` and collect the pku.edu.cn links found in it.

    The page body is decoded with the encoding guessed by ``chardet`` and
    stored in the global ``S``. Every quoted ``http(s)://...pku.edu.cn``
    prefix that is not yet in the global ``url_list`` is appended to it and
    printed. The global ``TIME_C`` timestamp is refreshed along the way so
    the watchdog thread knows the crawl is still making progress.

    Args:
        from_url: URL of the page to download.

    Returns:
        None. Results are reported through ``url_list``, ``S`` and stdout.
        Timeouts, watchdog interrupts and URL errors are swallowed so that
        one bad page does not stop the crawl.
    """
    # Imported locally: on Python < 3.10 ``socket.timeout`` is not a
    # subclass of ``TimeoutError`` and must be caught explicitly.
    import socket

    global S, TIME_C
    try:
        TIME_C = time.time()
        # ``interrupt_main`` cannot wake a thread blocked in a socket call,
        # so give the request its own timeout (same 10 s as the watchdog).
        # The ``with`` block guarantees the connection is closed.
        with urllib.request.urlopen(from_url, timeout=10) as temp:
            if temp.status != 200:
                return
            ts = temp.read()
        TIME_C = time.time()

        # chardet reports ``None`` when it cannot guess (e.g. empty body);
        # fall back to UTF-8 instead of crashing in ``decode``.
        encoding = chardet.detect(ts)['encoding'] or 'utf-8'
        try:
            S = ts.decode(encoding, 'replace')
        except LookupError:
            # chardet named a codec this Python build does not provide.
            S = ts.decode('utf-8', 'replace')

        # Dots are escaped so that only a literal "pku.edu.cn" matches.
        # NOTE(review): inside a character class ``\b`` means a backspace
        # character, not a word boundary -- confirm that is intended.
        pattern = re.compile(r'"https?://[^\b\n<>"]*?pku\.edu\.cn')
        # finditer walks the text once instead of re-slicing it per match.
        for g in pattern.finditer(S):
            TIME_C = time.time()
            link = g.group(0)[1:]  # drop the leading double quote
            if link not in url_list:
                url_list.append(link)
                print(link)
    except KeyboardInterrupt:
        # Raised in the main thread by the watchdog's interrupt_main().
        TIME_C = time.time()
        print("\nTime out\n")
        return
    except (TimeoutError, socket.timeout):
        print("\nTime out\n")
        return
    except urllib.error.URLError:
        return


def ergodic_list(start_point):
    """Crawl every URL in ``url_list`` from index ``start_point`` onwards.

    Each pass visits the entries that existed when the pass began; links
    that ``put_in`` appends during a pass are visited in the next pass.
    The function returns once a pass adds nothing new.
    """
    global url_list
    pass_start = start_point
    while True:
        pass_end = len(url_list)
        # The slice is a snapshot, so appends made by put_in during this
        # pass are left for the next one.
        for page in url_list[pass_start:]:
            print("\nfrom "+page+"\n"+"-"*10)
            put_in(page)
        if len(url_list) <= pass_end:
            return
        pass_start = pass_end


if __name__ == '__main__':
    # Shared crawl state used by the functions above.
    S = ''  # decoded text of the most recently fetched page
    url_list = ['http://www.pku.edu.cn/sitemap/bzdt.html']  # seed URL

    # The timestamp must exist before the watchdog thread starts reading it.
    TIME_C = time.time()
    _thread.start_new_thread(time_control, ())

    ergodic_list(0)
 

你可能感兴趣的:(多线程,python,网络,正则表达式)