Today I finished connecting Python to MongoDB. It uses the pymongo module for this purpose.
I'll give the code directly, because it's very cold here and I have to go now...
import codecs
import http.client
import json
import re
import sys
import urllib.request

from bs4 import BeautifulSoup
from pymongo import MongoClient

# Only absolute http(s) links are followed; relative hrefs, anchors and
# missing hrefs (which str() turns into 'None') do not match.
_ABSOLUTE_HTTP_URL = re.compile(r'^https?://.+$')


def getHtml(url):
    """Download *url* and return the raw response body as bytes."""
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as page:
        return page.read()


def New(link, dep, son, far):
    """Record a discovered link in ``urllist`` and in the Mongo collection.

    Args:
        link: URL of the discovered page.
        dep: Depth of the link (the seed page is stored with depth 1).
        son: Number of outgoing links counted for this page so far.
        far: URL of the page on which this link was found ('' for the seed).
    """
    url = {'link': link, 'dep': dep, 'son': son, 'far': far}
    urllist.append(url)
    # Debug aid:
    # print(json.dumps(url, sort_keys=True, indent=4, separators=(',', ': ')))
    # insert_one() replaces Collection.insert(), which was removed in
    # PyMongo 4. It adds the generated '_id' to ``url`` in place, which
    # Develop() relies on to update the stored document later.
    col.insert_one(url)


def PrintSoup(soup):
    """Write the prettified *soup* to ``soup.txt`` as UTF-8, overwriting it."""
    with codecs.open('soup.txt', 'w+', 'utf-8') as out:
        out.write(soup.prettify())


def Develop(point):
    """Fetch the page at ``urllist[point]`` and record its absolute links.

    Every ``<a>`` whose href is an absolute http(s) URL is stored through
    New() one level deeper than the current page, and the current page's
    'son' counter is incremented for each of them.

    Args:
        point: Index into ``urllist`` of the page to expand.

    Raises:
        IndexError: If *point* is out of range for ``urllist``.
        OSError: If the page cannot be downloaded (urllib's URLError is a
            subclass of OSError).
    """
    url = urllist[point]
    link = url.get('link')
    soup = BeautifulSoup(getHtml(link), 'html.parser')

    found = 0
    for anchor in soup.find_all('a'):
        href = str(anchor.get('href'))
        if _ABSOLUTE_HTTP_URL.match(href):
            New(href, url.get('dep') + 1, 0, link)
            url['son'] += 1
            found += 1

    # The document was inserted before any children were counted, so the
    # stored 'son' would otherwise never reflect the increments above.
    if found and '_id' in url:
        col.update_one({'_id': url['_id']}, {'$set': {'son': url['son']}})


client = MongoClient('localhost', 27017)
db = client.Links
col = db.Links
point = 0
urllist = []
New("https://image.baidu.com", 1, 0, "")

# Breadth-first expansion: stop once 3000 links are known or every known
# link has been expanded.
while point < len(urllist) < 3000:
    try:
        Develop(point)
    except (OSError, http.client.HTTPException) as err:
        # A dead or misbehaving link must not abort the whole crawl.
        print('skipping %s: %s' % (urllist[point].get('link'), err),
              file=sys.stderr)
    point += 1