工作中的学习(日常更新)

列表推导式

x = [1,2,3,4,5]
out = []
for item in x:
    out.append(item**2)
out
#一行代码定义列表
x = [1,2,3,4,5]
out = [item ** 2 for item in x]
out

lambda表达式


double = lambda x:x*2
double(5)

Map / filter(下面的示例使用 filter)


seq = [1,2,3,4,5]
result = list(filter(lambda x:x>2,seq))
result

arange

#np.arange(start,stop,step)
np.arange(3,7,2)

linspace

#np.linspace(start,stop,num)
np.linspace(2.0,3.0,num = 5)

df.apply

#df.apply
import pandas as pd 
df = pd.DataFrame([[4,9],]*3,columns=['A','B'],index = ["age","salary",'name'])
df 

import numpy as np
# Apply np.sqrt through DataFrame.apply (square root of every value).
df.apply(np.sqrt)

# axis=1: the function is handed each row, so the sum runs across the columns.
df.apply(np.sum,axis = 1) # one result per row

# axis=0: the function is handed each column, so the sum runs down the rows.
df.apply(np.sum,axis = 0) # one result per column

pandas中Timestamp类

import pandas as pd
from datetime import datetime as dt
p1=pd.Timestamp(2017,6,19)
p2=pd.Timestamp(dt(2017,6,19,hour=9,minute=13,second=45))
p3=pd.Timestamp("2017-6-19 9:13:45")

p4=pd.to_datetime("2017-6-19 9:13:45")
p5=pd.to_datetime(dt(2017,6,19,hour=9,minute=13,second=45))

python时间转换

将python的datetime转换为unix时间戳

import time
import datetime
# datetime -> unix timestamp (seconds since the epoch, local time).
dtime = datetime.datetime.now()
ans_time = time.mktime(dtime.timetuple())
# unix timestamp -> datetime.
unix_ts = 1439111214.0
# Stored under its own name: binding the result to `time` would replace the
# `time` module and make the `time.time()` call below return a
# datetime.time object instead of a float timestamp.
dt_from_ts = datetime.datetime.fromtimestamp(unix_ts)
# Date-time string used by the conversion helpers.
st = "2017-11-23 16:10:10"
# Current date-time.
dt = datetime.datetime.now()
# Current unix timestamp (float seconds).
sp = time.time()

# 1.把datetime转成字符串
def datetime_toString(dt):
    """Print ``dt`` rendered as a ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        dt: object providing ``strftime`` (a ``datetime.datetime`` in this file).
    """
    rendered = dt.strftime("%Y-%m-%d %H:%M:%S")
    print("1.把datetime转成字符串: ", rendered)


# 2.把字符串转成datetime
def string_toDatetime(st):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string and print the resulting datetime.

    Args:
        st: text in the ``%Y-%m-%d %H:%M:%S`` layout.
    """
    parsed = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
    print("2.把字符串转成datetime: ", parsed)


# 3.把字符串转成时间戳形式
def string_toTimestamp(st):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string and print it as a unix timestamp.

    Args:
        st: text in the ``%Y-%m-%d %H:%M:%S`` layout; ``time.mktime`` treats
            the parsed value as local time.
    """
    parsed = time.strptime(st, "%Y-%m-%d %H:%M:%S")
    seconds = time.mktime(parsed)
    print("3.把字符串转成时间戳形式:", seconds)


# 4.把时间戳转成字符串形式
def timestamp_toString(sp):
    """Print the unix timestamp ``sp`` as a local ``YYYY-MM-DD HH:MM:SS`` string.

    Args:
        sp: seconds since the epoch, as accepted by ``time.localtime``.
    """
    local_struct = time.localtime(sp)
    rendered = time.strftime("%Y-%m-%d %H:%M:%S", local_struct)
    print("4.把时间戳转成字符串形式: ", rendered)


# 5.把datetime类型转为时间戳形式
def datetime_toTimestamp(dt):
    """Print ``dt`` converted to a unix timestamp via ``time.mktime``.

    Args:
        dt: object providing ``timetuple()`` (a ``datetime.datetime`` in this
            file); ``time.mktime`` treats the tuple as local time.
    """
    seconds = time.mktime(dt.timetuple())
    print("5.把datetime类型转外时间戳形式:", seconds)

# 1.把datetime转成字符串
datetime_toString(dt)
# 2.把字符串转成datetime
string_toDatetime(st)
# 3.把字符串转成时间戳形式
string_toTimestamp(st)
# 4.把时间戳转成字符串形式
timestamp_toString(sp)
# 5.把datetime类型转为时间戳形式
datetime_toTimestamp(dt)

at_time() / between_time()

i = pd.date_range('2018-04-09', periods=4, freq='12H')
>>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
>>> ts
                     A
2018-04-09 00:00:00  1
2018-04-09 12:00:00  2
2018-04-10 00:00:00  3
2018-04-10 12:00:00  4
>>> ts.at_time('12:00')
 i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
>>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
>>> ts
                     A
2018-04-09 00:00:00  1
2018-04-10 00:20:00  2
2018-04-11 00:40:00  3
2018-04-12 01:00:00  4

>>> ts.between_time('0:15', '0:45')
                     A
2018-04-10 00:20:00  2
2018-04-11 00:40:00  3
将 start_time 设置为晚于 end_time,即可得到不在这两个时间之间的记录:

>>> ts.between_time('0:45', '0:15')
                     A
2018-04-09 00:00:00  1
2018-04-12 01:00:00  4

2020.5.26 阴天 希望能养成随手记的习惯
1.dataframe series取最大值或最小值的索引

df.idxmax(self,axis=0,skipna=True) = df.argmax()

2.dataframe中的object转成需要的类型

# DataFrame.convert_objects() was deprecated and later removed from pandas;
# infer_objects() is its replacement for soft-converting object columns to a
# more specific dtype.
df = df.infer_objects()
filedf['pub_date2']=pd.to_datetime(filedf['pub_date'],format='%m/%d/%Y %H:%M:%S')

3.sklearn多项式回归

from sklearn.preprocessing import PolynomialFeatures
# Expand x into polynomial terms. degree=2 is a placeholder (the original
# left the value blank); pick the degree that suits the data.
ploy =  PolynomialFeatures(degree=2)
ploy.fit(x)
x2 = ploy.transform(x)

# The module name is lower-case: sklearn.linear_model.
from sklearn.linear_model import LinearRegression
reg   = LinearRegression()
reg.fit(x2,y)
y_pred = reg.predict(x2)
plt.scatter(x,y)
# Draw the fitted curve from left to right by ordering the predictions by x.
# assumes x holds a single feature (shape (n,) or (n, 1)) — TODO confirm
x_flat = np.ravel(x)
order = np.argsort(x_flat)
plt.plot(x_flat[order], y_pred[order])

2020.5.27 多云

1.break continue

break结束这个循环

continue结束本次循环

2.scipy.spatial

scipy.spatial.distance_matrix()

3.读取文件夹中的所有文件

import os
files = os.listdir(path)
for file in files:
    # os.listdir returns bare entry names, so join each one with `path`
    # before testing it; otherwise isdir() checks the current working directory.
    full_path = os.path.join(path, file)
    if not os.path.isdir(full_path):  # skip sub-directories, keep everything else
        print(full_path)  # placeholder: process the file here

4.group

group = df.groupby()
group.count()
group.size()

5.series.value_counts().to_dict()

6.Counter

from collections import Counter
Counter(data)

2020.5.28 晴天
1.对一簇经纬度点,进行凸包计算

from scipy.spatial import ConvexHull
points = np.random.rand(30, 2)
hull = ConvexHull(points)
plt.plot(points[:,0], points[:,1], 'o')
# hull.vertices 得到凸轮廓坐标的索引值,逆时针画
hull1=hull.vertices.tolist()#要闭合必须再回到起点[0]
hull1.append(hull1[0])
plt.plot(points[hull1,0], points[hull1,1], 'r--^',lw=2)
for i in range(len(hull1)-1):
    plt.text(points[hull1[i],0], points[hull1[i],1],str(i),fontsize=20)

参考链接:https://blog.csdn.net/qq_23298649/article/details/103869985

2020.6.3 雨天
Python DataFrame一列拆成多列以及一行拆成多行
参考链接:https://www.jb51.net/article/167001.htm

2020.6.16上海的梅雨季

#1.一行代码合并两个字典
{**{'a':1,'b':2},**{'c':3}}

#2.一行代码求多个列表中的最大值
max(max([ [1,2,3], [5,1], [4] ], key=lambda v: max(v)))

#3.一行代码生成逆序序列
list(range(10,-1,-1))

#4.一行代码完成数据透视
pd.pivot_table(df, index=['Manager', 'Rep'], values=['Price'], aggfunc=np.sum)

#5.在函数中设定过滤条件,迭代元素,保留返回值为 True 的元素
fil = filter(lambda x: x>10,[1,11,2,45,7,6,13])
list(fil)

#6.格式化输出字符串,format(value, format_spec)实质上是调用了value的format(format_spec)方法。
print("i am {0},age{1}".format("tom",18))

#7.反向迭代
reversed([1,4,2,3,1])

#8.返回一个表示由 range(start, stop, step) 所指定索引集的 slice对象
a = [1,4,2,3,1]
my_slice = slice(0,5,2)
a[my_slice]

#9.排序
a = [{'name':'xiaoming','age':18,'gender':'male'},{'name':': xiaohong','age':20,'gender':'female'}]
sorted(a,key=lambda x: x['age'],reverse=False)

#10.zip
x = [3,2,1]
y = [4,5,6]
list(zip(y,x))

a = range(5)
b = list('abcde')
[str(y) + str(x) for x,y in zip(a,b)]

#11.按条件分组
def bif_by(lst, f):
    """Split ``lst`` into two lists according to predicate ``f``.

    Args:
        lst: any iterable, including one-shot iterators and generators.
        f: callable applied to each element; its truthiness picks the bucket.

    Returns:
        ``[matching, non_matching]``, each preserving the input order.
    """
    matched, unmatched = [], []
    for x in lst:
        # A single pass with one predicate call per element: a generator is
        # not exhausted before the second bucket is filled, and a stateful
        # predicate cannot put an element in both buckets or in neither.
        (matched if f(x) else unmatched).append(x)
    return [matched, unmatched]
records = [25,89,31,34]
bif_by(records, lambda x: x<80) # [[25, 31, 34], [89]]

画图

#Matplotlib中的日期与时间间隔图
import datetime
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt

# dates for xaxis
event_date = [datetime.datetime(2008, 12, 3), datetime.datetime(2009, 1, 5), datetime.datetime(2009, 2, 3)]

# base date for yaxis can be anything, since information is in the time
anydate = datetime.date(2001,1,1)

# event times
event_start = [datetime.time(20, 12), datetime.time(12, 15), datetime.time(8, 1,)]
event_finish = [datetime.time(23, 56), datetime.time(16, 5), datetime.time(18, 34)]

# translate times and dates lists into matplotlib date format numpy arrays
start = np.fromiter((mdates.date2num(datetime.datetime.combine(anydate, event)) for event in event_start), dtype = 'float', count = len(event_start))
finish = np.fromiter((mdates.date2num(datetime.datetime.combine(anydate, event)) for event in event_finish), dtype = 'float', count = len(event_finish))
date = mdates.date2num(event_date)

# calculate events durations
duration = finish - start

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

# use errorbar to represent event duration
ax.errorbar(date, start, [np.zeros(len(duration)), duration], linestyle = '')
# make matplotlib treat both axis as times
ax.xaxis_date()
ax.yaxis_date()

plt.show()

SCIPY
求局部极值(或者求波峰波谷)

import numpy as np 
import pylab as pl
import matplotlib.pyplot as plt
import scipy.signal as signal
x=np.array([
    0, 6, 25, 20, 15, 8, 15, 6, 0, 6, 0, -5, -15, -3, 4, 10, 8, 13, 8, 10, 3,
    1, 20, 7, 3, 0 ])
# Compute the extrema once: argrelextrema returns a tuple of index arrays.
# Peaks are local maxima of x; valleys are found as local maxima of -x.
peaks = signal.argrelextrema(x, np.greater)
valleys = signal.argrelextrema(-x, np.greater)
plt.figure(figsize=(16,4))
plt.plot(np.arange(len(x)),x)
# print() calls: the Python 2 print statement is a syntax error on Python 3.
print(x[peaks])
print(peaks)

plt.plot(peaks[0],x[peaks],'o')
plt.plot(valleys[0],x[valleys],'+')
# plt.plot(peakutils.index(-x),x[peakutils.index(-x)],'*')
plt.show()
————————————————
版权声明:本文为CSDN博主「weijifen000」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/weijifen000/article/details/80070520/

[https://www.jb51.net/article/180654.htm](https://www.jb51.net/article/180654.htm)

带有AM和PM的时间转换

import datetime#字符串时间转换加变换时区
def timeutc(st):
    """Convert a 12-hour ``MM/DD/YYYY HH:MM:SS AM|PM`` string to 24-hour text.

    Despite the name, no time-zone conversion happens here: the value is only
    re-formatted.

    Args:
        st: space-separated string whose first token is ``MM/DD/YYYY``, whose
            second-to-last token is ``HH:MM:SS`` and whose last token is the
            meridiem marker (``'AM'`` or ``'PM'``).

    Returns:
        The same moment formatted as ``'%Y-%m-%d %H:%M:%S'``.

    Raises:
        ValueError: if a field is not numeric or is out of range.
        IndexError: if the date or time token has fewer than three fields.
    """
    # Imported locally because the module-level name ``datetime`` is rebound
    # between the module and the class by other snippets in this file.
    from datetime import datetime as _datetime

    tl = st.split(' ')
    meridiem = tl[-1]
    hm = tl[-2].split(':')
    date_parts = tl[0].split('/')

    h = int(hm[0])
    if meridiem == 'PM':
        # 12:xx PM is noon and stays 12; only 1-11 PM shift to 13-23.
        if h < 12:
            h += 12
    elif meridiem == 'AM' and h == 12:
        # 12:xx AM is the first hour of the day.
        h = 0

    old = _datetime(int(date_parts[2]), int(date_parts[0]), int(date_parts[1]),
                    h, int(hm[1]), int(hm[2]))
    return old.strftime('%Y-%m-%d %H:%M:%S')
#但是这种24点没法弄
from datetime import datetime
date_string = '2009-11-29 03:17:00 PM'
format = '%Y-%m-%d %I:%M:%S %p'
my_date = datetime.strptime(date_string, format) 

https://www.cnblogs.com/fwl8888/p/9635505.html

滑动回归

#1.
from pyfinance.ols import OLS, RollingOLS, PandasRollingOLS
y = data.usd
x = data.drop('usd', axis=1)
window = 12 # months
model = PandasRollingOLS(y=y, x=x, window=window) 
print(model.beta.head())
#2.
# Rolling 125-row window; each window is fitted with a degree-1 np.polyfit
# against 0..124 and the first coefficient (the slope) is kept.
# NOTE(review): if df is a pandas DataFrame, `df.values` is the underlying
# ndarray, which has no `.rolling`; presumably a column such as df['values']
# was intended — confirm.
df['slope'] = df.values.rolling(window=125).apply(lambda x: np.polyfit(np.array(range(0,125)), x, 1)[0], raw=True)
#3.
[https://blog.csdn.net/weixin_30701575/article/details/97739761](https://blog.csdn.net/weixin_30701575/article/details/97739761)
#4.滑动滤波
[https://cloud.tencent.com/developer/article/1451488](https://cloud.tencent.com/developer/article/1451488)
#5.滑动窗口函数
[https://www.cnblogs.com/nxf-rabbit75/p/10669516.html](https://www.cnblogs.com/nxf-rabbit75/p/10669516.html)
#6.LSTM滑动预测
[https://blog.csdn.net/zhonglongshen/article/details/94555337](https://blog.csdn.net/zhonglongshen/article/details/94555337)

时区转换

import time
from datetime import datetime, timedelta, timezone

# The UTC offset has to be parsed with %z. A literal "+08:00" in the format
# only matches inputs that end in exactly that text and never shifts the clock.
format1="%Y-%m-%d %H:%M:%S%z"
format2="%Y-%m-%d %H:%M:%S"
t1 = '2020-03-11 17:15:07+00:00'
# Offset-aware datetime carrying the +00:00 offset from the input.
t = datetime.strptime(t1, format1)
# Move to UTC+08:00, then drop the offset when formatting.
t2 = t.astimezone(timezone(timedelta(hours=8))).strftime(format2)
print(t2)

python dataframe操作
https://www.cnblogs.com/luowei93/p/11878639.html

pandas groupby agg

num_agg = {'Age':['min', 'mean', 'max']}
df.groupby('Country').agg(num_agg)

Python计算时间差

import datetime
def time_differ(date1='12:55:05',date2='13:15:05'):
    """Return the absolute gap between two ``HH:MM:SS`` clock strings.

    Args:
        date1: first clock time, ``%H:%M:%S``.
        date2: second clock time, ``%H:%M:%S``.

    Returns:
        A non-negative ``datetime.timedelta``, whichever argument is later.
    """
    clock_format = "%H:%M:%S"
    first = datetime.datetime.strptime(date1, clock_format)
    second = datetime.datetime.strptime(date2, clock_format)
    return abs(second - first)
date1 = '13:05:05'
date2 = '13:15:05'
differ = time_differ(date1,date2)
print(differ)

字符串时间差转换

"""
'1157 days, 9:46:39'
'12:00:01.824952'
'-1 day, 23:59:31.859767'
"""
import re

def parse(s):
    """Parse the ``str()`` form of a timedelta into ``timedelta`` keyword args.

    Accepts ``'H:MM:SS[.ffffff]'`` with an optional ``'<n> day[s], '`` prefix,
    e.g. ``'1157 days, 9:46:39'`` or ``'-1 day, 23:59:31.859767'``.

    Args:
        s: the string to parse.

    Returns:
        A dict of floats keyed by ``hours``, ``minutes``, ``seconds`` and,
        when the prefix is present, ``days``.

    Raises:
        ValueError: if ``s`` does not start with a recognised layout.
    """
    # The named groups need their <name>; a bare "(?P" does not compile.
    pattern = (
        r'(?:(?P<days>-?\d+) days?, )?'
        r'(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+(?:\.\d+)?)'
    )
    m = re.match(pattern, s)
    if m is None:
        raise ValueError("unrecognised timedelta string: %r" % (s,))
    # Skip the optional days group when it is absent, and use items():
    # iteritems() exists only on Python 2 dicts.
    return {key: float(val) for key, val in m.groupdict().items() if val is not None}

from datetime import timedelta

s1 = '1157 days, 9:46:39'
s2 = '12:00:01.824952'
s3 = '-1 day, 23:59:31.859767'
t1 = parse(s1)
t2 = parse(s2)
t3 = parse(s3)

timedelta(**t1) # datetime.timedelta(1157, 35199)
timedelta(**t2) # datetime.timedelta(0, 43201, 824952)
timedelta(**t3) # datetime.timedelta(-1, 86371, 859767)
##或者
import datetime
str="15:00:00"
strtime=str.split(":")
strtimedelta=datetime.timedelta(hours=int(strtime[0]),minutes=int(strtime[1]),seconds=int(strtime[2]))

SQL中 OVER(PARTITION BY) 取上一条,下一条等

https://www.cnblogs.com/zhqian/p/9140313.html

#计算某个字段在当前记录和下一条记录之间的差
select name, 
        hiredate, 
        next_hiredate,
        next_hiredate - hiredate as diff
from (
    select name, 
           hiredate,
           lead(hiredate)over(order by hiredate) next_hiredate
    from dataset
)

hausdorff距离##

# Point and LineString are the classes used below; the original import
# brought in only Polygon.
from shapely.geometry import LineString, Point, Polygon
point = Point(1, 1)
line = LineString([(2, 0), (2, 4), (3, 4)])
# Hausdorff distance between the point and the line.
point.hausdorff_distance(line)
# Plain distance to the line's last vertex, for comparison.
point.distance(Point(3, 4))

判断点是否在多边形内
http://gnss.help/2019/06/09/check-point-in-polygon/index.html

from shapely.geometry import Point
from shapely.geometry import Polygon
polygon2 = Polygon([(lon, lat) for lon, lat in points])
point = Point(122.35, 29.70)
polygon2.contains(point)

线段相交

from shapely.geometry import LineString
coords = [(0, 0), (1, 1)]
LineString(coords).crosses(LineString([(0, 1), (1, 0)]))

pywt
https://www.cnblogs.com/junge-mike/p/12761502.html

https://mne.tools/stable/auto_examples/preprocessing/plot_xdawn_denoising.html#sphx-glr-auto-examples-preprocessing-plot-xdawn-denoising-py

https://medium.com/impulse-neiry/simple-p300-classifier-on-open-data-27e906f68b83

http://europepmc.org/article/PMC/5698603

滤波
https://www.cnblogs.com/iwuqing/p/11380131.html

https://www.cnblogs.com/sunlightheart/p/12574848.html

https://www.cnblogs.com/sunlightheart/p/12542842.html

https://blog.csdn.net/shanwenkang/article/details/84345178

字符串的时间做差

#1.
pd.DataFrame(pd.to_datetime(time_df['END_TIME']) - pd.to_datetime(time_df['START_TIME']))
#2.
def time_delta(a,b):
    """Return ``a - b`` in hours for two ``YYYY-MM-DD HH:MM:SS`` strings.

    Args:
        a: minuend timestamp, ``%Y-%m-%d %H:%M:%S``.
        b: subtrahend timestamp, same layout.

    Returns:
        The signed difference as a float number of hours.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    left = datetime.strptime(a, stamp_format)
    right = datetime.strptime(b, stamp_format)
    elapsed = left - right
    return elapsed.total_seconds()/3600

df['delta'] = df.apply(lambda x: time_delta(x['end_postime'],x['start_postime']),axis=1)

matplotlib画图

import numpy as np
import matplotlib.pyplot as plt
# Style the four borders (spines) of the current Axes, then plot y against x.
y = np.arange(1,10,1)
x = np.arange(1,10,1)
bwith = 2 # line width applied to every spine below
ax = plt.gca()# current Axes, whose spines are styled below
plt.tick_params(axis='both',colors='gold') # colour of the ticks and tick labels on both axes
ax.spines['top'].set_color('red')  # top border drawn in red
ax.spines['right'].set_color('none')  # right border hidden (no colour)
ax.spines['bottom'].set_linewidth(bwith)
ax.spines['left'].set_linewidth(bwith)
ax.spines['top'].set_linewidth(bwith)
ax.spines['right'].set_linewidth(bwith)
plt.grid( color = 'black',linestyle='-.',linewidth = 1)
plt.plot(x,y)

https://blog.csdn.net/wuzlun/article/details/80059222

https://www.cnblogs.com/zhizhan/p/5615947.html

plotly画多个图(双坐标轴)

import plotly.graph_objects as go
trace0 = go.Scatter(x=weather.index, y=weather.speed, mode='lines+markers',marker=dict(opacity=0.4), name='speed')
trace1 = go.Scatter(x=weather.index, y=weather.wind, mode='lines+markers', marker=dict(opacity=0.5),name='wind',yaxis="y2")
data1 = [trace0, trace1]

# go.Layout可以创建图层对象,实现双坐标
layout = go.Layout(title="双坐标示例图",
                   yaxis=dict(title="wind"),
                   yaxis2=dict(title="speed", overlaying='y', side="right"),legend=dict(x=0, y=1, font=dict(size=10, color="black"),orientation='h'),activeshape=dict(opacity=1))
fig = go.Figure(data=data1, layout=layout)
fig.show()

plotly画多个子图

from plotly import subplots

# 设定布局,以便进行绘图,这儿是两行一列
fig = subplots.make_subplots(rows=2,cols=1)
trace0 = go.Scatter(x=weather.index, y=weather.speed, mode='lines+markers',marker=dict(opacity=0.4), name='speed')
trace1 = go.Scatter(x=weather.index, y=weather.wind, mode='lines+markers', marker=dict(opacity=0.5),name='wind',yaxis="y2")
fig.append_trace(trace1,1,1)
fig.append_trace(trace0,2,1)

# 设定每个子图的占位情况
fig.layout.yaxis1.domain = [0.35,1.0]
fig.layout.yaxis2.domain = [0,0.3]

# 设定整个fig的大小
fig.layout.width = 800
fig.layout.height = 600
fig.show()

python读取sql

# Connection settings are placeholders. `port` must have a value for the call
# to parse; 5432 is PostgreSQL's default port — replace it with the real one.
conn1 = psycopg2.connect(host="c",user="datareader",password="",port=5432,database="")
# parse_dates maps the column to keyword arguments for pandas.to_datetime.
route1 = pd.read_sql(sql,con=conn1,parse_dates={'postime':{'origin':'unix'}})

python re正则表达式

# Sample input. It is named `text` because `str` is the builtin type, and
# passing the type itself to the re functions raises TypeError.
text = "abc123def45"
# Raw strings keep \d and \D as regex escapes, not invalid string escapes.
re.findall(r"\d+", text) # runs of digits -> ['123', '45']
re.findall(r"\D+", text) # runs of non-digits -> ['abc', 'def']
re.sub(r'\d+','',text) # remove every run of digits -> 'abcdef'

两直线的夹角
https://blog.csdn.net/jizhidexiaoming/article/details/100009138

判断3个点是否为直线
https://blog.csdn.net/Changxing_J/article/details/107102182

hough变换检测直线(python)
https://blog.csdn.net/wss794/article/details/93023013

几个地理packages
https://github.com/pbrod/nvector/
https://fiona.readthedocs.io/en/latest/
pySAL

Calculating coordinates given a bearing and a distance

python绘制QQ图

import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np

n = 100
samples = st.norm.rvs(loc = 5, scale = 2, size = n)

samples_sort = sorted(samples)

x_labels_p = np.arange(1/(2*n), 1, 1/n)
y_labels_p = st.norm.cdf(samples_sort, loc = 5, scale = 2)

plt.scatter(x_labels_p, y_labels_p)
plt.title('PP plot for normal distribution samle')
plt.show()


x_labels_q = samples_sort
y_labels_q = st.norm.ppf(x_labels_p, loc = 5, scale = 2)

plt.scatter(x_labels_q, y_labels_q)
plt.title('QQ plot for normal distribution samle')
plt.show()

import statsmodels.api as sm
probplot = sm.ProbPlot(samples, dist = st.norm, loc = 5, scale = 2)
probplot.qqplot(line='45')
from scipy import stats
import numpy as np
x = np.arange(-5, 5, 0.1)
y = stats.norm.cdf(x, 0, 1)
plt.plot(x, y)

import pandas as pd
churn_raw_data = pd.read_csv('churn.txt')
day_minute = churn_raw_data['Day Mins']
sorted_ = np.sort(day_minute)
yvals = np.arange(len(sorted_))/float(len(sorted_))
plt.plot(sorted_, yvals)

x_label = stats.norm.ppf(yvals)  #对目标累计分布函数值求标准正太分布累计分布函数的逆
plt.scatter(x_label, sorted_)

stats.probplot(day_minute, dist="norm", plot=plt)
plt.show()

ks检验

from scipy.stats import shapiro, kstest
k1, p1 = kstest(arr_same, 'norm')

验证是否符合泊松分布

data <- rpois(n = 100, 20)
mean <- mean(data)
poisson.test(sum(data), length(data), mean)
#P值越大说明数据符合度越好

嵌套列表展开

list_1 = [[1, 2], [3, 4, 5], [6, 7], [8], [9]]
list_2 = sum(list_1, [])
print(list_2)

找出值大于某个数的键

{k:v for k, v in test_dict.items() if v>=3}

嵌套列表计数

from collections import Counter
from itertools import chain
Counter(chain.from_iterable(test))

pandas技巧
https://mp.weixin.qq.com/s/pqJ_TvM5fnwvKzg2e2-MSQ

import pandas as pd

df ={'姓名':[' 黄同学','黄至尊','黄老邪 ','陈大美','孙尚香'],
     '英文名':['Huang tong_xue','huang zhi_zun','Huang Lao_xie','Chen Da_mei','sun shang_xiang'],
     '性别':['男','women','men','女','男'],
     '身份证':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
     '身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
     '家庭住址':['湖北广水','河南信阳','广西桂林','湖北孝感','广东广州'],
     '电话号码':['13434813546','19748672895','16728613064','14561586431','19384683910'],
     '收入':['1.1万','8.5千','0.9万','6.5千','2.0万']}
df = pd.DataFrame(df)
df

#cat函数:用于字符串的拼接
df["姓名"].str.cat(df["家庭住址"],sep='-'*3)

#contains:判断某个字符串是否包含给定字符
df["家庭住址"].str.contains("广")

# startswith/endswith:判断某个字符串是否以…开头/结尾
df["姓名"].str.startswith("黄") 
df["英文名"].str.endswith("e")

#get:获取指定位置的字符串
df["姓名"].str.get(-1)
df["身高"].str.split(":")
df["身高"].str.split(":").str.get(0)

#len:计算字符串长度
df["性别"].str.len()

#upper/lower:英文大小写转换
df["英文名"].str.upper()
df["英文名"].str.lower()

#pad+side参数/center:在字符串的左边、右边或左右两边添加给定字符
df["家庭住址"].str.pad(10,fillchar="*")      # default side="left": fills on the left, like rjust()
df["家庭住址"].str.pad(10,side="right",fillchar="*")    # fills on the right, like ljust()
df["家庭住址"].str.center(10,fillchar="*")  # fills on both sides

#repeat:重复字符串几次
df["性别"].str.repeat(3)

#slice_replace:使用给定的字符串,替换指定的位置的字符
df["电话号码"].str.slice_replace(4,8,"*"*4)

#replace:将指定位置的字符,替换为给定的字符串
df["身高"].str.replace(":","-")

#replace:将指定位置的字符,替换为给定的字符串(接受正则表达式)
# regex=True is required for the pattern to be treated as a regular expression
# (recent pandas defaults to literal replacement); the raw string keeps \d and
# \. intact.
df["收入"].str.replace(r"\d+\.\d+","正则",regex=True)

df

#split方法+expand参数:搭配join方法功能很强大
# 普通用法
df["身高"].str.split(":")
# split方法,搭配expand参数
df[["身高描述","final身高"]] = df["身高"].str.split(":",expand=True)
# split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)

#strip/rstrip/lstrip:去除空白符、换行符
df["姓名"].str.len()
df["姓名"] = df["姓名"].str.strip()
df["姓名"].str.len()

#findall:利用正则表达式,去字符串中匹配,返回查找结果的列表
df["身高"]
df["身高"].str.findall("[a-zA-Z]+")

#extract/extractall:接受正则表达式,抽取匹配的字符串(一定要加上括号)
df["身高"].str.extract("([a-zA-Z]+)")
# extractall提取得到复合索引
df["身高"].str.extractall("([a-zA-Z]+)")
# extract搭配expand参数
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True)

字符串转换为带时区的时间

# IANA zone names are case-sensitive: the key is 'Asia/Shanghai'.
pd.Timestamp(x,tz='Asia/Shanghai')
#或者
def time_to_CN(arrive_time):
    """Convert a timestamp to the ``Asia/Shanghai`` zone and return it as ``str``.

    Args:
        arrive_time: object exposing ``tz_convert`` (presumably a tz-aware
            ``pandas.Timestamp`` in UTC — confirm with callers).

    Returns:
        ``str()`` of the converted value.
    """
    china_time = arrive_time.tz_convert('Asia/Shanghai')
    return str(china_time)

多层列表压成一层

from funcy import flatten
b = list(flatten(list))

只保留字符串中的字母和中文

import re
def remove_punctuation(mystr):
    """Keep only ASCII letters and characters in U+4E00..U+9FA5.

    Args:
        mystr: the text to clean.

    Returns:
        ``mystr`` with every other character (digits, punctuation,
        whitespace, ...) removed.
    """
    return re.sub("[^A-Za-z\u4e00-\u9fa5]", '', mystr)

你可能感兴趣的:(工作中的学习(日常更新))