使用 Stream Load 将数据同步到 StarRocks

方法1:使用requests同步


from requests import Session
import base64


class LoadSession(Session):
    """A ``Session`` whose credentials are kept when a request is redirected."""

    def rebuild_auth(self, prepared_request, response):
        """
        Deliberate no-op override.

        Because this hook does nothing, requests will always preserve the
        Authorization header on the redirected request.
        """
        return None


def main():
    """
    Stream Load demo using the third-party ``requests`` library.

    Sends three comma-separated rows to the ``_stream_load`` endpoint of
    table ``starrocks_demo.tb1`` with an HTTP PUT and prints the JSON body
    of the response.
    """
    username, password = 'root', ''
    headers = {
        # For a file upload, use "application/octet-stream" as Content-Type.
        "Content-Type":  "text/html; charset=UTF-8",
        "connection": "keep-alive",
        "max_filter_ratio": "0.2",
        "columns": "k,v",
        "column_separator": ',',
        "Expect": "100-continue",
    }
    payload = '''k1,v1\nk2,v2\nk3,v3'''
    database = 'starrocks_demo'
    tablename = 'tb1'
    api = 'http://master1:8030/api/%s/%s/_stream_load' % (database, tablename)

    # Use the session as a context manager so its pooled connections are
    # released even if the request or the JSON decoding raises.
    with LoadSession() as session:
        session.auth = (username, password)
        # For a file upload, pass data=open("a.csv", "rb") instead of payload.
        response = session.put(url=api, headers=headers, data=payload)
        print(response.json())


# Run the requests-based demo only when this file is executed as a script.
if __name__ == "__main__":
    main()

方法2:先将数据保存为文件,再执行终端命令(curl)同步

import subprocess
import time


class StarRocksClient(object):
    """Run a StarRocks Stream Load job by invoking the ``curl`` command."""

    def __init__(self, host, port, database, columns, sep,
                 username, password, filename, table, timeout):
        """
        Store the settings used to build the ``curl`` command.

        :param host: host name used in the Stream Load URL
        :param port: HTTP port used in the Stream Load URL
        :param database: target database name
        :param columns: value of the ``columns`` header, e.g. ``'k,v'``
        :param sep: value of the ``column_separator`` header
        :param username: user name for HTTP basic auth
        :param password: password for HTTP basic auth
        :param filename: path handed to ``curl -T`` (``"-"`` reads stdin)
        :param table: target table name
        :param timeout: integer value of the ``timeout`` header
        """
        self.filename = filename
        self.table = table
        self.columns = columns
        self.sep = sep
        self.host = host
        self.port = port
        self.database = database
        self.user = username
        self.password = password
        self.timeout = timeout

    def get_label(self):
        """
        Return a fresh label of the form ``<database>_<table>_<timestamp>``.

        The timestamp is the current ``time.time()`` value with ``.``
        replaced by ``_``, so two calls generally return different labels.
        """
        t = str(time.time()).replace(".", "_")
        return '_'.join([self.database, self.table, t])

    def _build_command(self, label):
        """Return the ``curl`` argument list for a load tagged with ``label``."""
        param_user = "%s:%s" % (self.user, self.password)
        param_url = "http://%s:%s/api/%s/%s/_stream_load" % (
            self.host, self.port, self.database, self.table
        )
        return [
            "curl", "--location-trusted",
            "-H", "columns: %s" % self.columns,
            "-H", "column_separator: %s" % self.sep,
            "-H", "label: %s" % label,
            "-H", "timeout: %d" % self.timeout,
            "-u", param_user,
            "-T", "%s" % self.filename,
            param_url,
        ]

    def load(self):
        """
        Run one Stream Load with ``curl`` and report the outcome.

        The label is generated once and used for the request header, the
        printed message and the return value, so all three always agree.

        :return: the label sent with this load
        """
        label = self.get_label()
        # An argument list (no shell) is passed, so values are not re-parsed
        # by a shell. NOTE(review): the exit status presumably reflects only
        # whether curl itself succeeded; the load result is in curl's output.
        returncode = subprocess.call(self._build_command(label))
        # print(...) with a single argument works on both Python 2 and 3.
        if returncode != 0:
            print("""\nLoad to starrocks failed! LABEL is %s""" % (label))
        else:
            print("""\nLoad to starrocks success! LABEL is %s """ % (label))
        return label


if __name__ == "__main__":
    # Stream Load demo driven by the Linux `curl` command.
    #
    # StarRocks DDL for the target table:
    #
    #     CREATE TABLE `starrocks_demo`.`tb1` (
    #       `k` varchar(65533) NULL COMMENT "",
    #       `v` varchar(65533) NULL COMMENT ""
    #     ) ENGINE=OLAP
    #     DUPLICATE KEY(`k`)
    #     COMMENT "OLAP"
    #     DISTRIBUTED BY HASH(`k`) BUCKETS 1
    #     PROPERTIES (
    #         "replication_num" = "1",
    #         "in_memory" = "false",
    #         "storage_format" = "DEFAULT"
    #     );

    # Load job 1: data comes from the local file /tmp/test.csv.
    # Usage: python CurlStreamLoad.py
    client1 = StarRocksClient(
        host="master1",
        port="8030",
        database="starrocks_demo",
        columns='k,v',
        sep=",",
        username="root",
        password="",
        filename="/tmp/test.csv",
        table="tb1",
        timeout=86400
    )
    client1.load()

    # Pause for one second between the two jobs.
    time.sleep(1)

    # Load job 2: data comes from stdin ("-").
    # Usage: echo 'k1,v1\nk2,v2'| python CurlStreamLoad.py
    client2 = StarRocksClient(
        host="master1",
        port="8030",
        database="starrocks_demo",
        columns='k,v',
        sep=",",
        username="root",
        password="",
        filename="-",
        table="tb1",
        timeout=86400
    )
    client2.load()

你可能感兴趣的:(StarRocks,python,大数据)