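# (Banner text captured from the source web page, not part of the script: "2019 unicorn companies are paying top salaries to recruit Python engineers - the standard >>>")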
# -*- coding: UTF-8 -*-
# tanj
# 2018-01-15
# filename grab_idioms.py
'''
Scrape an online idiom (chengyu) dictionary into MySQL.

Site index: http://cy.5156edu.com/cymore.html

For every letter a-z the script requests the listing pages
http://cy.5156edu.com/html2/<letter>.html, <letter>_2.html, ... until a
request fails, collects the text of every <u> element inside the first
<table> of each page, and inserts one row per idiom into the
``idioms_dic`` table (columns ``chengyu``, ``pinyin_index``, ``page``).

Database settings come from the ``[localdb]`` section of ``config.ini``
in the current working directory.
'''
import sys

try:  # Python 2 module names (what this script was written against)
    import ConfigParser
    import urllib2
except ImportError:  # same APIs under their Python 3 names
    import configparser as ConfigParser
    import urllib.request as urllib2

import MySQLdb
from bs4 import BeautifulSoup

# Prefix shared by every listing page.
BASE_URL = "http://cy.5156edu.com/html2/"

# Seconds to wait for the site before giving up on a request; without a
# timeout a stalled connection would hang the whole crawl.
FETCH_TIMEOUT = 30

# MySQLdb uses the "format" paramstyle, so %s marks a bound parameter.
# The driver escapes the values; scraped text is never spliced into SQL.
INSERT_SQL = ("insert into idioms_dic(chengyu,pinyin_index,page) "
              "values(%s,%s,%s)")

# Parsed config.ini, populated lazily by getConfig().
config = None


# get config
def getConfig():
    """Return the parsed ``config.ini``, reading it on first use only.

    The parser is cached in the module-level ``config``. A missing file
    is not reported here (``ConfigParser.read`` skips unreadable files);
    it shows up later as a missing-section error on ``get``.
    """
    global config
    if config is None:
        config = ConfigParser.ConfigParser()
        config.read("config.ini")
    return config


# get database connect
def get_con():
    """Open and return a new MySQL connection from the ``[localdb]`` settings.

    Reads ``host``, ``port``, ``user``, ``password``, ``database`` and
    ``charset``. The caller owns the connection and must close it.
    """
    # The cached parser is left in place so config.ini is parsed once per
    # run instead of once per batch.
    cfg = getConfig()
    return MySQLdb.connect(
        host=cfg.get('localdb', 'host'),
        port=int(cfg.get('localdb', 'port')),
        user=cfg.get('localdb', 'user'),
        passwd=cfg.get('localdb', 'password'),
        db=cfg.get('localdb', 'database'),
        charset=cfg.get('localdb', 'charset'),
    )


def executelist(sql_list):
    """Run a batch of statements on one connection, then commit.

    sql_list: sequence whose items are either a complete SQL string or a
        ``(sql, params)`` pair that is executed with bound parameters.
        An empty or None batch does nothing (no connection is opened).

    A statement that fails is reported and skipped; the remaining
    statements still run and the batch is committed (best effort).
    Errors from connecting or committing propagate to the caller. The
    cursor and connection are closed in every case.
    """
    if not sql_list:
        return

    conn = get_con()
    try:
        cursor = conn.cursor()
        try:
            for item in sql_list:
                if isinstance(item, (tuple, list)):
                    sql, params = item
                else:
                    sql, params = item, None
                try:
                    if params is None:
                        print(sql)
                    else:
                        print("%s %r" % (sql, params))
                    cursor.execute(sql, params)
                except Exception as e:
                    print("mysql query error: %s" % (e,))
        finally:
            cursor.close()
        conn.commit()
    finally:
        conn.close()


def _page_url(pinyin_index, num):
    """Return the listing URL for a letter; page 1 has no ``_N`` suffix."""
    suffix = "" if num == 1 else "_" + str(num)
    return BASE_URL + pinyin_index + suffix + ".html"


def _page_label(num):
    """Return the value stored in the ``page`` column for page ``num``.

    The first page is recorded as "0" and later pages by their number
    ("2", "3", ...); kept as-is so new rows match existing data.
    """
    return "0" if num == 1 else str(num)


def _fetch(url):
    """Download ``url`` and return the raw response body."""
    response = urllib2.urlopen(url, timeout=FETCH_TIMEOUT)
    try:
        return response.read()
    finally:
        response.close()


def _decode_page(raw):
    """Decode a downloaded page to text; never raises on bad bytes.

    GBK is tried first. The fallback is gb18030 (a superset of GBK) with
    undecodable bytes replaced, so a page is always decoded from its own
    bytes rather than skipped or confused with a previous page.
    """
    try:
        return raw.decode("gbk")
    except UnicodeDecodeError as e:
        print(e)
        return raw.decode("gb18030", "replace")


def _extract_idioms(html_text):
    """Return the idioms listed on a page, or None if it has no <table>.

    An idiom is the stripped text of a <u> element inside the first
    <table>. Elements with no text are skipped.
    """
    # Already-decoded text is passed in so BeautifulSoup does not have to
    # guess the encoding again.
    # NOTE(review): no parser is named, so bs4 picks the best one that is
    # installed; pin one (e.g. "html.parser") if results must be
    # identical across machines.
    soup = BeautifulSoup(html_text)
    tables = soup.select('table')
    if not tables:
        return None

    idioms = []
    for node in tables[0].select('u'):
        # get_text() also works when the <u> wraps nested tags, where
        # .string would be None.
        text = node.get_text().strip()
        if text:
            idioms.append(text)
    return idioms


pinyins = list("abcdefghijklmnopqrstuvwxyz")


def main():
    """Crawl every letter's listing pages and store the idioms found."""
    for pinyin_index in pinyins:
        print(pinyin_index)
        num = 1
        while True:
            url = _page_url(pinyin_index, num)
            page = _page_label(num)
            num += 1

            try:
                print(url)
                raw = _fetch(url)
            except Exception as e:
                # Any fetch failure ends this letter's crawl (presumably
                # the site answers with an HTTP error past the last page).
                print(e)
                break

            idioms = _extract_idioms(_decode_page(raw))
            if idioms is None:
                # Not a listing page; stop this letter instead of crashing.
                print("no table found: %s" % url)
                break

            sql_list = []
            for idiom in idioms:
                print(idiom)
                sql_list.append((INSERT_SQL, (idiom, pinyin_index, page)))

            try:
                executelist(sql_list)
            except Exception as e:
                # A database failure loses this page only; keep crawling.
                print(e)


if __name__ == "__main__":
    main()