列表推导式
# Build the list of squares with an explicit loop (compare with the
# one-line comprehension that follows).
x = [1, 2, 3, 4, 5]
out = []
for item in x:
    out.append(item ** 2)
out
#一行代码定义列表
x = [1,2,3,4,5]
out = [item ** 2 for item in x]
out
lambda表达式
double = lambda x:x*2
double(5)
Map / Filter(下面的示例用的是 filter;map 的写法为 list(map(lambda x: x*2, seq)))
seq = [1,2,3,4,5]
result = list(filter(lambda x:x>2,seq))
result
arange
#np.arange(start,stop,step)
np.arange(3,7,2)
linspace
#np.linspace(start,stop,num)
np.linspace(2.0,3.0,num = 5)
df.apply
# DataFrame.apply: apply a function along one axis of the DataFrame
import pandas as pd
df = pd.DataFrame([[4,9],]*3,columns=['A','B'],index = ["age","salary",'name'])
df
import numpy as np
df.apply(np.sqrt)
df.apply(np.sum,axis = 1) # axis=1 ("columns"): the function receives each row, giving one sum per row
df.apply(np.sum,axis = 0) # axis=0 ("index"): the function receives each column, giving one sum per column
pandas中Timestamp类
import pandas as pd
from datetime import datetime as dt
p1=pd.Timestamp(2017,6,19)
p2=pd.Timestamp(dt(2017,6,19,hour=9,minute=13,second=45))
p3=pd.Timestamp("2017-6-19 9:13:45")
p4=pd.to_datetime("2017-6-19 9:13:45")
p5=pd.to_datetime(dt(2017,6,19,hour=9,minute=13,second=45))
python时间转换
将python的datetime转换为unix时间戳
import time
import datetime

# Convert a Python datetime to a Unix timestamp
dtime = datetime.datetime.now()
ans_time = time.mktime(dtime.timetuple())
# Convert a Unix timestamp back to a Python datetime.
# The result is deliberately NOT named `time`: rebinding that name hides the
# `time` module, after which `time.time()` / `time.mktime()` below would be
# resolved on a datetime object and fail or return the wrong thing.
unix_ts = 1439111214.0
ts_datetime = datetime.datetime.fromtimestamp(unix_ts)
# Date-time string
st = "2017-11-23 16:10:10"
# Current date-time
dt = datetime.datetime.now()
# Current Unix timestamp (float seconds)
sp = time.time()
# 1. Convert a datetime to a string
def datetime_toString(dt):
    """Print *dt* formatted as ``YYYY-MM-DD HH:MM:SS`` and return that string.

    Args:
        dt: An object with a ``strftime`` method, e.g. ``datetime.datetime``.

    Returns:
        The formatted string that was printed.
    """
    text = dt.strftime("%Y-%m-%d %H:%M:%S")
    print("1.把datetime转成字符串: ", text)
    return text
# 2. Convert a string to a datetime
def string_toDatetime(st):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string, print the datetime and return it.

    Args:
        st: Text in ``%Y-%m-%d %H:%M:%S`` format.

    Returns:
        The parsed ``datetime.datetime``.

    Raises:
        ValueError: If *st* does not match the expected format.
    """
    parsed = datetime.datetime.strptime(st, "%Y-%m-%d %H:%M:%S")
    print("2.把字符串转成datetime: ", parsed)
    return parsed
# 3. Convert a string to a Unix timestamp
def string_toTimestamp(st):
    """Parse a ``YYYY-MM-DD HH:MM:SS`` string as local time and return its timestamp.

    The timestamp is printed as well as returned.

    Args:
        st: Text in ``%Y-%m-%d %H:%M:%S`` format, interpreted by
            ``time.mktime`` as local time.

    Returns:
        Seconds since the epoch as a float.

    Raises:
        ValueError: If *st* does not match the expected format.
    """
    stamp = time.mktime(time.strptime(st, "%Y-%m-%d %H:%M:%S"))
    print("3.把字符串转成时间戳形式:", stamp)
    return stamp
# 4. Convert a Unix timestamp to a string
def timestamp_toString(sp):
    """Format a Unix timestamp as a local-time ``YYYY-MM-DD HH:MM:SS`` string.

    The string is printed as well as returned.

    Args:
        sp: Seconds since the epoch (int or float).

    Returns:
        The formatted local-time string.
    """
    text = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(sp))
    print("4.把时间戳转成字符串形式: ", text)
    return text
# 5. Convert a datetime to a Unix timestamp
def datetime_toTimestamp(dt):
    """Return the Unix timestamp for *dt*, interpreting it as local time.

    The timestamp is printed as well as returned. It is built from
    ``dt.timetuple()``, which carries no sub-second field, so microseconds
    are dropped.

    Args:
        dt: A ``datetime.datetime`` (anything exposing ``timetuple``).

    Returns:
        Seconds since the epoch as a float.
    """
    stamp = time.mktime(dt.timetuple())
    print("5.把datetime类型转外时间戳形式:", stamp)
    return stamp
# 1.把datetime转成字符串
datetime_toString(dt)
# 2.把字符串转成datetime
string_toDatetime(st)
# 3.把字符串转成时间戳形式
string_toTimestamp(st)
# 4.把时间戳转成字符串形式
timestamp_toString(sp)
# 5.把datetime类型转为时间戳形式
datetime_toTimestamp(dt)
between_time()
i = pd.date_range('2018-04-09', periods=4, freq='12H')
>>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
>>> ts
A
2018-04-09 00:00:00 1
2018-04-09 12:00:00 2
2018-04-10 00:00:00 3
2018-04-10 12:00:00 4
>>> ts.at_time('12:00')
i = pd.date_range('2018-04-09', periods=4, freq='1D20min')
>>> ts = pd.DataFrame({'A': [1,2,3,4]}, index=i)
>>> ts
A
2018-04-09 00:00:00 1
2018-04-10 00:20:00 2
2018-04-11 00:40:00 3
2018-04-12 01:00:00 4
>>> ts.between_time('0:15', '0:45')
A
2018-04-10 00:20:00 2
2018-04-11 00:40:00 3
把 start_time 设置得晚于 end_time,就可以得到不在这两个时间之间的记录:
>>> ts.between_time('0:45', '0:15')
A
2018-04-09 00:00:00 1
2018-04-12 01:00:00 4
2020.5.26 阴天 希望能养成随手记的习惯
1.dataframe series取最大值或最小值的索引
df.idxmax(axis=0, skipna=True)  # 返回最大值所在的索引标签(最小值用 idxmin);注意新版 pandas 中 Series.argmax() 返回的是整数位置而不是标签,DataFrame 没有 argmax()
2.dataframe中的object转成需要的类型
# DataFrame.convert_objects() was deprecated and later removed from pandas;
# infer_objects() is the documented replacement for re-inferring the dtypes
# of object columns.
df = df.infer_objects()
filedf['pub_date2']=pd.to_datetime(filedf['pub_date'],format='%m/%d/%Y %H:%M:%S')
3.sklearn多项式回归
from sklearn.preprocessing import PolynomialFeatures
# Expand x into polynomial feature columns.
# NOTE(review): the degree was left blank in the original note; 2 is only a
# starting value -- tune it for the data at hand.
ploy = PolynomialFeatures(degree=2)
ploy.fit(x)
x2 = ploy.transform(x)
from sklearn.linear_model import LinearRegression
# Ordinary linear regression on the expanded features
reg = LinearRegression()
reg.fit(x2,y)
y_pred = reg.predict(x2)
plt.scatter(x,y)
# Plot the predictions in ascending-x order so the fitted curve is drawn left to right
plt.plot(np.sort(x),y_pred[np.argsort(x)])
2020.5.27 多云
1.break continue
break结束这个循环
continue结束本次循环
2.scipy.spatial
scipy.spatial.distance_matrix()
3.读取文件夹中的所有文件
import os
files = os.listdir(path)
for file in files:
    # os.listdir() returns bare names, so join with `path` before testing;
    # otherwise isdir() would resolve the name against the current directory.
    if not os.path.isdir(os.path.join(path, file)):
        pass  # not a directory: process the file here
4.group
group = df.groupby()
group.count()
group.size()
5.series.value_counts().to_dict()
6.Counter
from collections import Counter
Counter(data)
2020.5.28 晴天
1.对一簇经纬度点,进行凸包计算
from scipy.spatial import ConvexHull
points = np.random.rand(30, 2)
hull = ConvexHull(points)
plt.plot(points[:,0], points[:,1], 'o')
# hull.vertices holds the indices of the points on the hull outline
# (counter-clockwise for 2-D input)
hull1=hull.vertices.tolist()
# Repeat the first vertex at the end so the drawn outline closes
hull1.append(hull1[0])
plt.plot(points[hull1,0], points[hull1,1], 'r--^',lw=2)
# Label each hull vertex with its position along the outline; the duplicated
# closing vertex is skipped
for i in range(len(hull1)-1):
    plt.text(points[hull1[i],0], points[hull1[i],1],str(i),fontsize=20)
参考链接:https://blog.csdn.net/qq_23298649/article/details/103869985
2020.6.3 雨天
Python DataFrame一列拆成多列以及一行拆成多行
参考链接:https://www.jb51.net/article/167001.htm
2020.6.16上海的梅雨季
#1.一行代码合并两个字典
{**{'a':1,'b':2},**{'c':3}}
#2.一行代码求多个列表中的最大值
max(max([ [1,2,3], [5,1], [4] ], key=lambda v: max(v)))
#3.一行代码生成逆序序列
list(range(10,-1,-1))
#4.一行代码完成数据透视
pd.pivot_table(df, index=['Manager', 'Rep'], values=['Price'], aggfunc=np.sum)
#5.在函数中设定过滤条件,迭代元素,保留返回值为 True 的元素
fil = filter(lambda x: x>10,[1,11,2,45,7,6,13])
list(fil)
#6.格式化输出字符串,format(value, format_spec)实质上是调用了value的format(format_spec)方法。
print("i am {0},age{1}".format("tom",18))
#7.反向迭代
reversed([1,4,2,3,1])
#8.返回一个表示由 range(start, stop, step) 所指定索引集的 slice对象
a = [1,4,2,3,1]
my_slice = slice(0,5,2)
a[my_slice]
#9.排序
a = [{'name':'xiaoming','age':18,'gender':'male'},{'name':': xiaohong','age':20,'gender':'female'}]
sorted(a,key=lambda x: x['age'],reverse=False)
#10.zip
x = [3,2,1]
y = [4,5,6]
list(zip(y,x))
a = range(5)
b = list('abcde')
[str(y) + str(x) for x,y in zip(a,b)]
# 11. Split a list into two groups by a predicate
def bif_by(lst, f):
    """Partition *lst* into items that satisfy *f* and items that do not.

    The predicate is evaluated once per item and the input is traversed a
    single time, so one-shot iterables work too.

    Args:
        lst: Iterable of items to split.
        f: Predicate called with each item; its truthiness picks the group.

    Returns:
        ``[matching, non_matching]``, each a list in input order.
    """
    matching, non_matching = [], []
    for x in lst:
        (matching if f(x) else non_matching).append(x)
    return [matching, non_matching]
records = [25,89,31,34]
bif_by(records, lambda x: x<80) # [[25, 31, 34], [89]]
画图
#Matplotlib中的日期与时间间隔图
import datetime
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
# dates for xaxis
event_date = [datetime.datetime(2008, 12, 3), datetime.datetime(2009, 1, 5), datetime.datetime(2009, 2, 3)]
# base date for yaxis can be anything, since information is in the time
anydate = datetime.date(2001,1,1)
# event times
event_start = [datetime.time(20, 12), datetime.time(12, 15), datetime.time(8, 1,)]
event_finish = [datetime.time(23, 56), datetime.time(16, 5), datetime.time(18, 34)]
# translate times and dates lists into matplotlib date format numpy arrays
start = np.fromiter((mdates.date2num(datetime.datetime.combine(anydate, event)) for event in event_start), dtype = 'float', count = len(event_start))
finish = np.fromiter((mdates.date2num(datetime.datetime.combine(anydate, event)) for event in event_finish), dtype = 'float', count = len(event_finish))
date = mdates.date2num(event_date)
# calculate events durations
duration = finish - start
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)
# use errorbar to represent event duration
ax.errorbar(date, start, [np.zeros(len(duration)), duration], linestyle = '')
# make matplotlib treat both axis as times
ax.xaxis_date()
ax.yaxis_date()
plt.show()
SCIPY
求局部极值(或者求波峰波谷)
import numpy as np
import pylab as pl
import matplotlib.pyplot as plt
import scipy.signal as signal
x=np.array([
    0, 6, 25, 20, 15, 8, 15, 6, 0, 6, 0, -5, -15, -3, 4, 10, 8, 13, 8, 10, 3,
    1, 20, 7, 3, 0 ])
plt.figure(figsize=(16,4))
plt.plot(np.arange(len(x)),x)
# Local maxima: indices where x is strictly greater than both neighbours
peaks = signal.argrelextrema(x, np.greater)
print(x[peaks])
print(peaks)
plt.plot(peaks[0], x[peaks], 'o')
# Local minima are found as the local maxima of -x
valleys = signal.argrelextrema(-x, np.greater)
plt.plot(valleys[0], x[valleys], '+')
# plt.plot(peakutils.index(-x),x[peakutils.index(-x)],'*')
plt.show()
————————————————
版权声明:本文为CSDN博主「weijifen000」的原创文章,遵循CC 4.0 BY-SA版权协议,转载请附上原文出处链接及本声明。
原文链接:https://blog.csdn.net/weijifen000/article/details/80070520/
[https://www.jb51.net/article/180654.htm](https://www.jb51.net/article/180654.htm)
带有AM和PM的时间转换
import datetime#字符串时间转换加变换时区
def timeutc(st):
    """Convert a 12-hour ``M/D/YYYY H:MM:SS AM|PM`` string to 24-hour form.

    No timezone conversion is performed; only the clock notation changes.

    Args:
        st: Space-separated text whose first field is the date
            (``month/day/year``), whose second-to-last field is the time
            (``hour:minute:second``) and whose last field is the AM/PM
            marker (case-insensitive).

    Returns:
        The moment formatted as ``%Y-%m-%d %H:%M:%S``.

    Raises:
        ValueError: If a field is missing, non-numeric or out of range.
    """
    # Imported locally: elsewhere in this file the name `datetime` is rebound
    # to the class, which would break `datetime.datetime(...)` here.
    import datetime as _dt

    fields = st.split(' ')
    marker = fields[-1].upper()
    hour_s, minute_s, second_s = fields[-2].split(':')[:3]
    month_s, day_s, year_s = fields[0].split('/')[:3]

    hour = int(hour_s)
    if marker == 'PM' and hour < 12:
        hour += 12  # 1 PM..11 PM -> 13..23; 12 PM is noon and stays 12
    elif marker == 'AM' and hour == 12:
        hour = 0  # 12 AM is midnight

    moment = _dt.datetime(int(year_s), int(month_s), int(day_s))
    if hour == 24:
        # A literal hour of 24 means midnight of the following day; timedelta
        # rolls over month and year ends, unlike incrementing the day number.
        moment += _dt.timedelta(days=1)
    else:
        moment = moment.replace(hour=hour, minute=int(minute_s), second=int(second_s))
    return moment.strftime('%Y-%m-%d %H:%M:%S')
#但是这种24点没法弄
from datetime import datetime
date_string = '2009-11-29 03:17:00 PM'
format = '%Y-%m-%d %I:%M:%S %p'
my_date = datetime.strptime(date_string, format)
https://www.cnblogs.com/fwl8888/p/9635505.html
滑动回归
#1.
from pyfinance.ols import OLS, RollingOLS, PandasRollingOLS
y = data.usd
x = data.drop('usd', axis=1)
window = 12 # months
model = PandasRollingOLS(y=y, x=x, window=window)
print(model.beta.head())
#2.
# Rolling 125-row slope: fit a degree-1 polynomial to each window and keep the slope.
# rolling() is a pandas Series/DataFrame method; the ndarray returned by `.values` has none.
# NOTE(review): the first column is used here -- replace with the intended column.
df['slope'] = df.iloc[:, 0].rolling(window=125).apply(lambda x: np.polyfit(np.arange(125), x, 1)[0], raw=True)
#3.
[https://blog.csdn.net/weixin_30701575/article/details/97739761](https://blog.csdn.net/weixin_30701575/article/details/97739761)
#4.滑动滤波
[https://cloud.tencent.com/developer/article/1451488](https://cloud.tencent.com/developer/article/1451488)
#5.滑动窗口函数
[https://www.cnblogs.com/nxf-rabbit75/p/10669516.html](https://www.cnblogs.com/nxf-rabbit75/p/10669516.html)
#6.LSTM滑动预测
[https://blog.csdn.net/zhonglongshen/article/details/94555337](https://blog.csdn.net/zhonglongshen/article/details/94555337)
时区转换
import time
import datetime as dt_module

# %z parses the numeric UTC offset ("+00:00"); a literal "+08:00" in the
# format would only match inputs that end in exactly that text.
format1="%Y-%m-%d %H:%M:%S%z"
format2="%Y-%m-%d %H:%M:%S"
t1 = '2020-03-11 17:15:07+00:00'
t = dt_module.datetime.strptime(t1, format1)
# Shift the aware datetime to UTC+08:00 before formatting it without an offset
t2 = t.astimezone(dt_module.timezone(dt_module.timedelta(hours=8))).strftime(format2)
print(t2)
python dataframe操作
https://www.cnblogs.com/luowei93/p/11878639.html
pandas groupby agg
num_agg = {'Age':['min', 'mean', 'max']}
df.groupby('Country').agg(num_agg)
Python计算时间差
import datetime
def time_differ(date1='12:55:05',date2='13:15:05'):
    """Return the absolute difference between two ``HH:MM:SS`` clock times.

    Both strings are parsed without a date part, so the result never wraps
    around midnight.

    Args:
        date1: First time in ``%H:%M:%S`` format.
        date2: Second time in ``%H:%M:%S`` format.

    Returns:
        A non-negative ``datetime.timedelta``.

    Raises:
        ValueError: If either string does not match ``%H:%M:%S``.
    """
    first = datetime.datetime.strptime(date1, "%H:%M:%S")
    second = datetime.datetime.strptime(date2, "%H:%M:%S")
    return abs(second - first)
date1 = '13:05:05'
date2 = '13:15:05'
differ = time_differ(date1,date2)
print(differ)
字符串时间差转换
"""
'1157 days, 9:46:39'
'12:00:01.824952'
'-1 day, 23:59:31.859767'
"""
import re
def parse(s):
    """Parse the ``str()`` form of a timedelta into ``timedelta`` keyword arguments.

    Accepts text such as ``'1157 days, 9:46:39'``, ``'12:00:01.824952'`` or
    ``'-1 day, 23:59:31.859767'``.

    Args:
        s: The timedelta text to parse.

    Returns:
        A dict with float values keyed by ``hours``, ``minutes``, ``seconds``
        and, when the text has a day part, ``days``; pass it as
        ``timedelta(**parse(s))``.

    Raises:
        ValueError: If *s* is not in a recognised format.
    """
    clock = r'(?P<hours>\d+):(?P<minutes>\d+):(?P<seconds>\d+(?:\.\d+)?)'
    if 'day' in s:
        pattern = r'(?P<days>-?\d+) days?, ' + clock
    else:
        pattern = clock
    m = re.match(pattern, s)
    if m is None:
        raise ValueError("unrecognised timedelta string: %r" % (s,))
    return {key: float(val) for key, val in m.groupdict().items()}
from datetime import timedelta
s1 = '1157 days, 9:46:39'
s2 = '12:00:01.824952'
s3 = '-1 day, 23:59:31.859767'
t1 = parse(s1)
t2 = parse(s2)
t3 = parse(s3)
timedelta(**t1) # datetime.timedelta(1157, 35199)
timedelta(**t2) # datetime.timedelta(0, 43201, 824952)
timedelta(**t3) # datetime.timedelta(-1, 86371, 859767)
##或者
import datetime
# Named `time_str` rather than `str` so the built-in `str` type stays usable
time_str="15:00:00"
strtime=time_str.split(":")
strtimedelta=datetime.timedelta(hours=int(strtime[0]),minutes=int(strtime[1]),seconds=int(strtime[2]))
SQL中 OVER(PARTITION BY) 取上一条,下一条等
https://www.cnblogs.com/zhqian/p/9140313.html
#计算某个字段在当前记录和下一条记录之间的差
select name,
hiredate,
next_hiredate,
next_hiredate - hiredate as diff
from (
select name,
hiredate,
lead(hiredate)over(order by hiredate) next_hiredate
from dataset
)
hausdorff距离
# Point and LineString are the classes used below, so they must be imported too
from shapely.geometry import LineString, Point, Polygon
point = Point(1, 1)
line = LineString([(2, 0), (2, 4), (3, 4)])
# Hausdorff distance between the point and the polyline
point.hausdorff_distance(line)
# Plain distance between two points, for comparison
point.distance(Point(3, 4))
判断点是否在多边形内
http://gnss.help/2019/06/09/check-point-in-polygon/index.html
from shapely.geometry import Point
from shapely.geometry import Polygon
polygon2 = Polygon([(lon, lat) for lon, lat in points])
point = Point(122.35, 29.70)
polygon2.contains(point)
线段相交
from shapely.geometry import LineString
coords = [(0, 0), (1, 1)]
LineString(coords).crosses(LineString([(0, 1), (1, 0)]))
pywt
https://www.cnblogs.com/junge-mike/p/12761502.html
https://mne.tools/stable/auto_examples/preprocessing/plot_xdawn_denoising.html#sphx-glr-auto-examples-preprocessing-plot-xdawn-denoising-py
https://medium.com/impulse-neiry/simple-p300-classifier-on-open-data-27e906f68b83
http://europepmc.org/article/PMC/5698603
滤波
https://www.cnblogs.com/iwuqing/p/11380131.html
https://www.cnblogs.com/sunlightheart/p/12574848.html
https://www.cnblogs.com/sunlightheart/p/12542842.html
https://blog.csdn.net/shanwenkang/article/details/84345178
字符串的时间做差
#1.
pd.DataFrame(pd.to_datetime(time_df['END_TIME']) - pd.to_datetime(time_df['START_TIME']))
#2.
def time_delta(a,b):
    """Return ``a - b`` in hours for two ``YYYY-MM-DD HH:MM:SS`` strings.

    Args:
        a: Later (minuend) date-time text in ``%Y-%m-%d %H:%M:%S`` format.
        b: Earlier (subtrahend) date-time text in the same format.

    Returns:
        The signed difference in hours as a float; negative when *a* is
        earlier than *b*.

    Raises:
        ValueError: If either string does not match the format.
    """
    # Imported locally: this file binds `datetime` to the module in some
    # sections and to the class in others.
    from datetime import datetime

    fmt = "%Y-%m-%d %H:%M:%S"
    return (datetime.strptime(a, fmt) - datetime.strptime(b, fmt)).total_seconds() / 3600
df['delta'] = df.apply(lambda x: time_delta(x['end_postime'],x['start_postime']),axis=1)
matplotlib画图
import numpy as np
import matplotlib.pyplot as plt
y = np.arange(1,10,1)
x = np.arange(1,10,1)
bwith = 2 # border (spine) line width
ax = plt.gca()# current axes; its spines are the four borders of the plot
plt.tick_params(axis='both',colors='gold') # colour of the ticks and tick labels on both axes
ax.spines['top'].set_color('red') # top border drawn in red
ax.spines['right'].set_color('none') # right border made invisible
ax.spines['bottom'].set_linewidth(bwith)
ax.spines['left'].set_linewidth(bwith)
ax.spines['top'].set_linewidth(bwith)
ax.spines['right'].set_linewidth(bwith)
plt.grid( color = 'black',linestyle='-.',linewidth = 1)
plt.plot(x,y)
https://blog.csdn.net/wuzlun/article/details/80059222
https://www.cnblogs.com/zhizhan/p/5615947.html
plotly画双坐标轴图
import plotly.graph_objects as go
# trace0 (speed) uses the default primary y-axis; trace1 (wind) is bound to the secondary axis y2
trace0 = go.Scatter(x=weather.index, y=weather.speed, mode='lines+markers',marker=dict(opacity=0.4), name='speed')
trace1 = go.Scatter(x=weather.index, y=weather.wind, mode='lines+markers', marker=dict(opacity=0.5),name='wind',yaxis="y2")
data1 = [trace0, trace1]
# go.Layout builds the layout object; overlaying a second y-axis on the first
# gives the dual-axis chart. Axis titles follow the traces drawn on them:
# speed on the primary axis, wind on y2.
layout = go.Layout(title="双坐标示例图",
                   yaxis=dict(title="speed"),
                   yaxis2=dict(title="wind", overlaying='y', side="right"),
                   legend=dict(x=0, y=1, font=dict(size=10, color="black"),orientation='h'),
                   activeshape=dict(opacity=1))
fig = go.Figure(data=data1, layout=layout)
fig.show()
plotly画多个子图
from plotly import subplots
# 设定布局,以便进行绘图,这儿是两行一列
fig = subplots.make_subplots(rows=2,cols=1)
trace0 = go.Scatter(x=weather.index, y=weather.speed, mode='lines+markers',marker=dict(opacity=0.4), name='speed')
trace1 = go.Scatter(x=weather.index, y=weather.wind, mode='lines+markers', marker=dict(opacity=0.5),name='wind',yaxis="y2")
fig.append_trace(trace1,1,1)
fig.append_trace(trace0,2,1)
# 设定每个子图的占位情况
fig.layout.yaxis1.domain = [0.35,1.0]
fig.layout.yaxis2.domain = [0,0.3]
# 设定整个fig的大小
fig.layout.width = 800
fig.layout.height = 600
fig.show()
python读取sql
# NOTE(review): the port was left blank in the original note; 5432 is the
# PostgreSQL default -- confirm against the real server. Keep real credentials
# out of source (e.g. read them from environment variables).
conn1 = psycopg2.connect(host="c",user="datareader",password="",port=5432,database="")
# parse_dates with origin='unix' converts the numeric `postime` column to datetimes
route1 = pd.read_sql(sql,con=conn1,parse_dates={'postime':{'origin':'unix'}})
python re正则表达式
# `text` is the input string; the original note passed `str`, which is the
# built-in type unless it has been shadowed. Raw strings keep `\d` / `\D` from
# being treated as (invalid) string escapes.
re.findall(r"\d+", text)  # every run of digits in the string
re.findall(r"\D+", text)  # every run of non-digits in the string
re.sub(r'\d+', '', text)  # remove every run of digits
两直线的夹角
https://blog.csdn.net/jizhidexiaoming/article/details/100009138
判断3个点是否为直线
https://blog.csdn.net/Changxing_J/article/details/107102182
hough变换检测直线(python)
https://blog.csdn.net/wss794/article/details/93023013
几个地理packages
https://github.com/pbrod/nvector/
https://fiona.readthedocs.io/en/latest/
pySAL
Calculating coordinates given a bearing and a distance
python绘制QQ图
import scipy.stats as st
import matplotlib.pyplot as plt
import numpy as np
n = 100
samples = st.norm.rvs(loc = 5, scale = 2, size = n)
samples_sort = sorted(samples)
x_labels_p = np.arange(1/(2*n), 1, 1/n)
y_labels_p = st.norm.cdf(samples_sort, loc = 5, scale = 2)
plt.scatter(x_labels_p, y_labels_p)
plt.title('PP plot for normal distribution samle')
plt.show()
x_labels_q = samples_sort
y_labels_q = st.norm.ppf(x_labels_p, loc = 5, scale = 2)
plt.scatter(x_labels_q, y_labels_q)
plt.title('QQ plot for normal distribution samle')
plt.show()
import statsmodels.api as sm
probplot = sm.ProbPlot(samples, dist = st.norm, loc = 5, scale = 2)
probplot.qqplot(line='45')
from scipy import stats
import numpy as np
x = np.arange(-5, 5, 0.1)
y = stats.norm.cdf(x, 0, 1)
plt.plot(x, y)
import pandas as pd
churn_raw_data = pd.read_csv('churn.txt')
day_minute = churn_raw_data['Day Mins']
sorted_ = np.sort(day_minute)
yvals = np.arange(len(sorted_))/float(len(sorted_))
plt.plot(sorted_, yvals)
x_label = stats.norm.ppf(yvals) #对目标累计分布函数值求标准正太分布累计分布函数的逆
plt.scatter(x_label, sorted_)
stats.probplot(day_minute, dist="norm", plot=plt)
plt.show()
ks检验
from scipy.stats import shapiro, kstest
k1, p1 = kstest(arr_same, 'norm')
验证是否符合泊松分布
data <- rpois(n = 100, 20)
mean <- mean(data)
poisson.test(sum(data), length(data), mean)
#P值越大说明数据符合度越好
嵌套列表展开
from itertools import chain

list_1 = [[1, 2], [3, 4, 5], [6, 7], [8], [9]]
# chain.from_iterable flattens one level in linear time; sum(list_1, [])
# rebuilds the accumulated list on every addition, which is quadratic.
list_2 = list(chain.from_iterable(list_1))
print(list_2)
找出值大于某个数的键
{k:v for k, v in test_dict.items() if v>=3}
嵌套列表计数
from collections import Counter
from itertools import chain
Counter(chain.from_iterable(test))
pandas技巧
https://mp.weixin.qq.com/s/pqJ_TvM5fnwvKzg2e2-MSQ
import pandas as pd
df ={'姓名':[' 黄同学','黄至尊','黄老邪 ','陈大美','孙尚香'],
'英文名':['Huang tong_xue','huang zhi_zun','Huang Lao_xie','Chen Da_mei','sun shang_xiang'],
'性别':['男','women','men','女','男'],
'身份证':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
'身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
'家庭住址':['湖北广水','河南信阳','广西桂林','湖北孝感','广东广州'],
'电话号码':['13434813546','19748672895','16728613064','14561586431','19384683910'],
'收入':['1.1万','8.5千','0.9万','6.5千','2.0万']}
df = pd.DataFrame(df)
df
#cat函数:用于字符串的拼接
df["姓名"].str.cat(df["家庭住址"],sep='-'*3)
#contains:判断某个字符串是否包含给定字符
df["家庭住址"].str.contains("广")
# startswith/endswith:判断某个字符串是否以…开头/结尾
df["姓名"].str.startswith("黄")
df["英文名"].str.endswith("e")
#get:获取指定位置的字符串
df["姓名"].str.get(-1)
df["身高"].str.split(":")
df["身高"].str.split(":").str.get(0)
#len:计算字符串长度
df["性别"].str.len()
#upper/lower:英文大小写转换
df["英文名"].str.upper()
df["英文名"].str.lower()
# pad (with the side argument) / center: add a fill character on the left, the right, or both sides
df["家庭住址"].str.pad(10,fillchar="*") # side defaults to "left": fills on the left, equivalent to rjust()
df["家庭住址"].str.pad(10,side="right",fillchar="*") # fills on the right, equivalent to ljust()
df["家庭住址"].str.center(10,fillchar="*")
#repeat:重复字符串几次
df["性别"].str.repeat(3)
#slice_replace:使用给定的字符串,替换指定的位置的字符
df["电话号码"].str.slice_replace(4,8,"*"*4)
#replace:将指定位置的字符,替换为给定的字符串
df["身高"].str.replace(":","-")
# replace with a regular expression: substitute every match of the pattern.
# regex=True is required on pandas >= 2.0, where the pattern is otherwise
# treated as a literal string; the raw string keeps `\d` and `\.` intact.
df["收入"].str.replace(r"\d+\.\d+","正则",regex=True)
df
#split方法+expand参数:搭配join方法功能很强大
# 普通用法
df["身高"].str.split(":")
# split方法,搭配expand参数
df[["身高描述","final身高"]] = df["身高"].str.split(":",expand=True)
# split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)
#strip/rstrip/lstrip:去除空白符、换行符
df["姓名"].str.len()
df["姓名"] = df["姓名"].str.strip()
df["姓名"].str.len()
#findall:利用正则表达式,去字符串中匹配,返回查找结果的列表
df["身高"]
df["身高"].str.findall("[a-zA-Z]+")
#extract/extractall:接受正则表达式,抽取匹配的字符串(一定要加上括号)
df["身高"].str.extract("([a-zA-Z]+)")
# extractall提取得到复合索引
df["身高"].str.extractall("([a-zA-Z]+)")
# extract搭配expand参数
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True)
字符串转换为带时区的时间
# IANA zone keys are case-sensitive: the key is 'Asia/Shanghai'
pd.Timestamp(x,tz='Asia/Shanghai')
#或者
def time_to_CN(arrive_time):
    """Convert a timezone-aware timestamp to China time and return it as a string.

    Args:
        arrive_time: An object with a ``tz_convert`` method; presumably a
            tz-aware ``pandas.Timestamp`` in UTC -- verify against callers.

    Returns:
        ``str()`` of the timestamp after conversion to ``Asia/Shanghai``.
    """
    return str(arrive_time.tz_convert('Asia/Shanghai'))
多层列表压成一层
from funcy import flatten
# `nested` is the multi-level list to flatten; the original note passed the
# built-in `list` type itself as the argument.
b = list(flatten(nested))
只保留字符串中的字母和中文
import re
def remove_punctuation(mystr):
    """Return *mystr* with everything except ASCII letters and Chinese characters removed.

    Digits, whitespace and underscores are removed along with punctuation.

    Args:
        mystr: The text to clean.

    Returns:
        The text containing only ``A-Z``, ``a-z`` and characters in the
        range U+4E00..U+9FA5.
    """
    # Raw string: the regex engine, not the string literal, interprets the \u escapes
    reg = r"[^A-Za-z\u4e00-\u9fa5]"
    return re.sub(reg, '', mystr)