Pandas分析Nginx日志求pv, uv

安装所需Python库

$ pip install numpy pandas matplotlib pytz

注:所有工作都在Python 3中实践 (Python 2应该也没有问题)
Nginx日志格式

$remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"

注:Nginx 日志格式可以自行设置;如果你的日志格式与上面不同,请相应修改下面代码中用于切分字段的正则表达式。
分析日志
加载日志

import time
import pytz
import pandas as pd
import matplotlib.pyplot as plt

from datetime import datetime

# Wall-clock timestamp taken when the module is loaded.  Nothing in the visible
# part of this file reads it back (presumably intended for timing the run).
start_time = time.time()


def parse_str(s):
    """
    Drop the first and the last character of `s` and return what is left.

    Used as a column converter to peel the enclosing quote/bracket pair
    off a raw log field.  Inputs shorter than two characters yield an
    empty result.

    Example:
    `>>> parse_str('[some string]')`
    `'some string'`
    """
    inner = slice(1, -1)
    return s[inner]

# Inclusive bounds of the analysis window, as naive datetimes in the log's
# local time.  Both bounds used to be 2017-06-02 00:00:00, which shrank the
# window to a single instant; the start is now 1 Jun 2017 so the window agrees
# with the 1 Jun - 2 Jun 2017 range that load_data filters on.
# The (misspelled) name `time_bigin_compare` is kept for compatibility.
time_bigin_compare = datetime(2017, 6, 1, 0, 0, 0)

time_end_compare = datetime(2017, 6, 2, 0, 0, 0)


def parse_datetime(date):
    """
    Return a timezone-aware datetime parsed from an Nginx `$time_local` field.

    Parse datetime with timezone format as:
        `[day/month/year:hour:minute:second timezone]`
    The surrounding square brackets are part of the expected input and are
    stripped before parsing.

    Every well-formed value is converted and returned.  Earlier revisions
    returned `None` (and printed the value) depending on a fixed date window;
    selecting a date range is left to the caller.  The UTC offset is read with
    `%z`, so negative offsets that carry minutes (e.g. `-0330`) keep the
    correct sign on the minutes part.

    Raises `ValueError` when the text does not match the format.

    Example:
    `>>> parse_datetime('[17/Jan/2017:13:00:52 +0800]')`
    `datetime.datetime(2017, 1, 17, 13, 0, 52, tzinfo=datetime.timezone(datetime.timedelta(seconds=28800)))`
    """
    # date[1:-1] removes the enclosing "[" and "]"; "%z" consumes the signed
    # "+HHMM" / "-HHMM" offset and attaches it as tzinfo.
    return datetime.strptime(date[1:-1], '%d/%b/%Y:%H:%M:%S %z')


def top_remote_ip(df, n=5):
    """
    Plot the `n` most frequent values of the 'remote_ip' column.

    Rows are counted per remote IP, the counts are sorted ascending and the
    last `n` entries are drawn as a horizontal bar chart, which is then shown.

    NOTE: this file defines `top_remote_ip` a second time further down; the
    later definition is the one bound to the name once the module is loaded.
    """
    hits_per_ip = df.groupby('remote_ip')['remote_ip'].agg(len)
    # Dividing hits_per_ip by hits_per_ip.sum() here would plot shares
    # instead of raw counts.
    busiest = hits_per_ip.sort_values()[-n:]
    axes = busiest.plot(kind='barh', title='Remote Access', rot=45, alpha=0.75)
    axes.set_ylabel('Remote IP')
    axes.set_xlabel('Access Count')
    plt.show()


def load_data(filename, start='[1/Jun/2017:00:00:00 +0800]',
              end='[2/Jun/2017:00:00:00 +0800]'):
    """
    Load an Nginx access log, print pv and uv, and return the filtered rows.

    Parameters:
        filename: path of the access log to read.
        start, end: inclusive bounds of the time window, written like the
            log's own time field, e.g. `'[1/Jun/2017:00:00:00 +0800]'`.
            The defaults reproduce the previously hard-coded window.

    Prints:
        `pv`: number of rows with a non-null remote_ip in the whole file.
        `uv`: number of distinct remote_ip values inside the time window.
        NOTE(review): pv is taken before the time filter and uv after it, so
        the two figures cover different ranges - confirm this is intended.

    Returns:
        The rows inside the window, reduced to one row per remote_ip (the
        last occurrence of each is kept).
    """
    df = pd.read_table(
        filename,
        # Split on a whitespace character that is neither inside double quotes
        # nor inside square brackets, so quoted fields and the [time] field
        # stay in one piece.  Raw string: the pattern contains backslashes.
        sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
        engine='python',
        na_values='-',
        header=None,
        usecols=[0, 3, 4, 5, 6, 7, 8],
        names=['remote_ip', 'date', 'request', 'status', 'size', 'referer', 'user_agent'],
        converters={
            'date': parse_datetime,
            'request': parse_str,
            'status': int,
            'size': int,
            'referer': parse_str,
            'user_agent': parse_str,
        },
    )

    # Page views: every row that has a remote_ip.
    print("pv: {}".format(df['remote_ip'].count()))

    # The bounds are fixed window limits rather than log rows, so they are
    # parsed here directly (brackets stripped, "%z" reads the UTC offset).
    bound_format = '%d/%b/%Y:%H:%M:%S %z'
    time_start_compare = datetime.strptime(start[1:-1], bound_format)
    time_end_compare = datetime.strptime(end[1:-1], bound_format)

    # Time filter (inclusive on both ends).
    in_window = (df['date'] >= time_start_compare) & (df['date'] <= time_end_compare)
    df = df[in_window]

    # De-duplicate: keep the last row seen for each remote_ip.  Assigning the
    # result avoids mutating a filtered frame in place.
    df = df.drop_duplicates(subset=['remote_ip'], keep="last")

    # Unique visitors inside the window.
    print("uv: {}".format(df['remote_ip'].nunique()))

    return df

访问次数最多的IP地址

def top_remote_ip(df, n=5):
    """
    Show a horizontal bar chart of the `n` remote IPs with the most rows.

    The number of rows per 'remote_ip' is computed, ordered from smallest to
    largest, and the final `n` entries are plotted.
    """
    per_ip = df.groupby('remote_ip')
    counts = per_ip['remote_ip'].agg(len)
    ordered = counts.sort_values()
    chart = ordered[-n:].plot(
        kind='barh',
        title='Remote Access',
        rot=45,
        alpha=0.75,
    )
    chart.set_xlabel('Access Count')
    chart.set_ylabel('Remote IP')
    plt.show()

被请求次数最多的API
def top_request_api(df, n=5):
    """
    Show a bar chart of the `n` most frequent values in the 'request' column.

    The column is counted exactly as stored; no path is extracted from the
    request text before counting.
    """
    request_counts = df['request'].value_counts()
    leading = request_counts[:n]
    leading.plot(kind='bar')
    plt.show()

请求次数最多的 HTTP 方法
def top_request_method(df):
    """
    Show a horizontal bar chart of how often each HTTP method was requested.

    The method is taken to be the first run of non-whitespace characters in
    each value of the 'request' column.
    """
    # Raw string keeps the backslash in \S literal.  expand=False makes the
    # single capture group come back as a Series; by default str.extract
    # returns a one-column DataFrame, which does not plot as one bar per
    # method.
    method = df['request'].str.extract(r'(\S+)', expand=False)
    method.value_counts().plot(kind='barh')
    plt.show()

访问频率
def access_rate_base_datetime(df, rule='D', begin=None, end=None):
    """
    Plot the number of requests per time bucket.

    Parameters:
        df: frame with a 'request' column and a 'date' column; 'date' becomes
            the index that is resampled (presumably datetime values produced
            when the log was loaded - confirm against the loader).
        rule: resampling frequency passed to `resample`, e.g. 'D' or 'H'.
        begin, end: optional labels bounding the plotted range; a missing or
            falsy bound leaves that side open.
    """
    visits = df['request'].copy()
    visits.index = df['date']
    visits = visits.resample(rule, kind='period').count()

    # One slice covers all four begin/end combinations: a falsy bound is
    # turned into None, which leaves that end of the slice open.
    visits = visits[begin or None:end or None]

    visits.plot()
    plt.title('Total visits')
    plt.ylabel('visits')  # axis label was misspelled 'vistis'
    plt.xlabel('datetime')
    plt.show()

测试
if __name__ == '__main__':
    # Demo run: load the log next to this script and plot hourly visits.
    filename = './access.log'
    # The loader defined in this file is load_data; the previous call to
    # load_csv referred to a name that does not exist here.
    df = load_data(filename)
    # print(df.head())
    # top_remote_ip(df, 10)
    # top_request_api(df)
    # top_request_method(df)
    # NOTE(review): load_data keeps only rows between 1 Jun and 2 Jun 2017 when
    # called like this, while the range plotted below is 17-19 Feb 2017 -
    # confirm which window is intended.
    access_rate_base_datetime(df, 'H', '2017-02-17', '2017-02-19')

结果


image.png

转自:https://www.jianshu.com/p/6b7d63534235

你可能感兴趣的:(Pandas分析Nginx日志求pv, uv)