多线程快速抓取网页

下面是一段简单的多线程编程示例代码(Python 2),用于抓取维基百科页面:内存占用很少,线程数调大后抓取效率很高。运行时依次传入三个参数:输入文件(每行一个词条)、输出文件前缀、线程数。
import commands
import os
import subprocess
import sys
import thread
import threading
import time

# Number of worker threads that have reported completion so far.
finish_num = 0
# Protects updates to finish_num, which is shared by all workers.
mutex = threading.Lock()

def extract_qid(id, num_of_thread):
    """Fetch the Wikipedia pages for this worker's share of the input lines.

    The input file is ``sys.argv[1]``; every line is treated as one query.
    Line number ``n`` (0-based) is handled by the worker for which
    ``n % num_of_thread == id``. For each handled line the page
    ``zh.wikipedia.org/zh-hans/<query>`` is downloaded with ``wget`` into
    the scratch file ``fetch_wiki/tmp_search_<id>``; the URL followed by the
    page content is appended to ``sys.argv[2] + ".part" + str(id)`` and the
    scratch file is removed. The ``fetch_wiki`` directory is not created
    here, so it must already exist.

    A failure on a single line is reported on stderr and the line is
    skipped; the remaining lines are still processed.

    Args:
        id: Zero-based index of this worker. (The name shadows the builtin
            but is kept so existing callers keep working.)
        num_of_thread: Total number of workers sharing the input file.

    Returns:
        True if the whole input was processed, False if a fatal error (for
        example an unreadable input file) aborted the worker.

    Side effects:
        Increments the module-level ``finish_num`` under ``mutex`` exactly
        once, whether the worker succeeded or failed, so that code waiting
        on ``finish_num`` can never hang on a failed worker.
    """
    global finish_num
    tmp_path = "fetch_wiki/tmp_search_" + str(id)
    try:
        with open(sys.argv[1], "r") as fin, \
                open(sys.argv[2] + ".part" + str(id), "w") as fout, \
                open(os.devnull, "w") as devnull:
            for count, line in enumerate(fin):
                if count % num_of_thread != id:
                    continue
                query = line.strip()
                url = "zh.wikipedia.org/zh-hans/" + query
                try:
                    # Pass an argument list (no shell) so that quotes or
                    # shell metacharacters in the query cannot break or
                    # hijack the command. wget's output is discarded.
                    subprocess.call(["wget", url, "-O", tmp_path],
                                    stdout=devnull, stderr=devnull)
                    # Open the scratch file before writing the header so
                    # that nothing is written when the download produced
                    # no file at all.
                    with open(tmp_path, "r") as tmp_fin:
                        fout.write(url + "\n")
                        for tmp_line in tmp_fin:
                            fout.write(tmp_line)
                    os.remove(tmp_path)
                except Exception as e:
                    # Best effort: report the problem and move on.
                    sys.stderr.write("worker %s: skipping %r: %s\n"
                                     % (id, query, e))
                    continue
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Count this worker as finished even when it failed.
        with mutex:
            finish_num += 1

# Script entry: start one worker per requested thread and wait for all of them.
# sys.argv[3] is the worker count; parse it once instead of on every use.
num_of_thread = int(sys.argv[3])

workers = []
for i in range(num_of_thread):
    worker = threading.Thread(target=extract_qid, args=(i, num_of_thread))
    # Daemon threads do not keep the process alive if the main thread is
    # interrupted, matching the behaviour of thread.start_new_thread.
    worker.daemon = True
    worker.start()
    workers.append(worker)

# Wait on the threads themselves rather than polling a shared counter, so a
# worker that fails without reporting completion cannot hang the script.
for worker in workers:
    # A timed join in a loop keeps the main thread responsive to Ctrl-C
    # (an untimed join can delay KeyboardInterrupt on Python 2).
    while worker.is_alive():
        worker.join(1)

你可能感兴趣的:(多线程编程)