
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author: xxx

import csv
import logging
import pymysql
import sys
import time

#from text_quality_classifier import para_sims_tagger_feature_exact
#from text_quality_classifier import layout_tagger_feature_extract
#from content_base_feature import content_base_fea_extract
#from custom_dict_feature import feature_custom_dict_v4_train
#from infor_entropy_fea import part_of_speech_extract_jieba

# Configure logging level and format
logging.basicConfig(
    level=logging.FATAL,
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')

def connect_db():
    """mysql链接
    """
    return pymysql.connect(host='10.xxx.83.15',
                           port=5203,
                           user='om_article_r',
                           password='0f59ee296',
                           database='om_article',
                           charset='utf8')

# Module-level connection and cursor, shared by all queries below
con = connect_db()
cur = con.cursor()
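
# Note (illustrative sketch, not part of the original script): long batch runs
# can hit "MySQL server has gone away" when the server times out an idle
# connection. pymysql's Connection.ping(reconnect=True) re-establishes the
# link; calling a helper like this before each query is one way to cope.
def ensure_connected():
    """Hypothetical helper: reconnect if the server dropped the connection."""
    con.ping(reconnect=True)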

def select_from_sql(cmsid):
    """Look up one article (media_id, title, content) by cmsid."""
    # Derive the monthly table suffix (YYYYMM) from the cmsid: digit-leading
    # ids carry the date in their first 6 characters; otherwise it follows a
    # 3-character prefix.
    date = cmsid
    if date[0].isdigit():
        date = date[:6]
    else:
        date = date[3:9]
    # Build the query string
    sql_str = ("select media_id,title,content"
               " from news_article_%s where cmsid='%s' limit 1" % (date, cmsid))
    logging.info(sql_str)

    try:
        cur.execute(sql_str)
        row = cur.fetchone()
        # fetchone() returns None when no row matches; normalize to empty fields
        return row if row else ("", "", "")
    except pymysql.Error:
        logging.error("query failed for cmsid=%s", cmsid)
        return ("", "", "")

def get_baicao_metas(art_file, out_file):
    """Query baicao (the article MySQL store) for each cmsid and write its metadata."""
    lst_cmsid = []
    lst_label = []
    lst_media_id = []
    lst_title = []
    lst_content = []
    # Write the header row first; the collected rows are appended afterwards
    with open(out_file, "w", encoding='utf-8') as fw:
        fw.write("cmsid\tlabel\tmedia_id\ttitle\tcontent\n")

    with open(art_file, 'r', encoding='utf-8') as fr:
        # Skip the header line, then process one tab-separated record per line
        for line in fr.readlines()[1:]:
            try:
                line_list = line.strip().split('\t')
                label = line_list[0]
                cmsid = line_list[1]
                # Query the baicao MySQL store
                media_id, title, content = select_from_sql(cmsid)
                # Skip rows that were not found, before appending anything
                if title == "":
                    logging.warning("not found: " + cmsid)
                    continue
                # Strip characters that would break the tab-separated output
                content = content.replace("\n", "").replace("\r", "").replace("\t", "")
                title = title.replace("\n", "").replace("\r", "").replace("\t", "")
                lst_cmsid.append(cmsid)
                lst_label.append(label)
                lst_media_id.append(media_id)
                lst_title.append(title)
                lst_content.append(content)
                # Progress indicator
                print(len(lst_cmsid))
            except Exception:
                logging.exception("skipping bad line")
                continue
            # Throttle the query rate (0.0s: effectively disabled)
            time.sleep(0.0)
    with open(out_file, "a", encoding='utf-8') as fw:
        for i in range(len(lst_cmsid)):
            fw.write(lst_cmsid[i] + '\t' + lst_label[i] + '\t' + lst_media_id[i] + '\t' + lst_title[i] + '\t' + lst_content[i] + '\n')
    cur.close()
    con.close()
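
# Sketch (assuming the same five-column row layout as above): since csv is
# already imported, the manual tab-joined writes could equally use csv.writer
# with a tab delimiter, which also quotes any stray separators in the fields.
def write_rows_tsv(out_file, rows):
    """Hypothetical helper; rows yields (cmsid, label, media_id, title, content)."""
    with open(out_file, "a", encoding="utf-8", newline="") as fw:
        writer = csv.writer(fw, delimiter="\t")
        writer.writerows(rows)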

# Run: read the labelled corpus and write the enriched copy. The hardcoded
# paths are the defaults; they can be overridden from the command line:
#   ./request_baicao.py corpus[IN] corpus.new[OUT]
if __name__ == "__main__":
    art_file = '../ft_local/titles_n_o_2.eval.a.1_30'
    out_file = '../ft_local/titles_n_o_2_out.eval.a.1_30'
    if len(sys.argv) >= 3:
        art_file, out_file = sys.argv[1], sys.argv[2]
    get_baicao_metas(art_file, out_file)
