性――交―性――乱免费看,看全色黄大色黄大片4033,中国一级片视频,欧美经典三级,日韩福利视频一区二区三区,考逼视频,av成人影视,亚洲日韩欧洲无码A∨夜夜爽

來源：https://github.com/SeafyLiang/Python_study?
編輯：數(shù)據(jù) STUDIO

大家好，我是 Jack~

工作中最近常用到pandas做數(shù)據(jù)處理和分析，特意總結(jié)了以下常用內(nèi)容。

pandas常用速查

引入依賴

#?導(dǎo)入模塊
import?pymysql
import?pandas?as?pd
import?numpy?as?np
import?time

#?數(shù)據(jù)庫
from?sqlalchemy?import?create_engine

#?可視化
import?matplotlib.pyplot?as?plt
#?如果你的設(shè)備是配備Retina屏幕的mac，可以在jupyter?notebook中，使用下面一行代碼有效提高圖像畫質(zhì)
%config?InlineBackend.figure_format?=?'retina'
#?解決?plt?中文顯示的問題?mymac
plt.rcParams['font.sans-serif']?=?['Arial?Unicode?MS']
#?設(shè)置顯示中文?需要先安裝字體?aistudio
plt.rcParams['font.sans-serif']?=?['SimHei']?#?指定默認(rèn)字體
plt.rcParams['axes.unicode_minus']?=?False??#?用來正常顯示負(fù)號(hào)
import?seaborn?as?sns
#?notebook渲染圖片
%matplotlib?inline
import?pyecharts

#?忽略版本問題
import?warnings
warnings.filterwarnings("ignore")??

#?下載中文字體
!wget?https://mydueros.cdn.bcebos.com/font/simhei.ttf?
#?將字體文件復(fù)制到?matplotlib'字體路徑
!cp?simhei.ttf?/opt/conda/envs/python35-paddle120-env/Lib/python3,7/site-packages/matplotib/mpl-data/fonts.

#?一般只需要將字體文件復(fù)制到系統(tǒng)字體田錄下即可,但是在?studio上該路徑?jīng)]有寫權(quán)限,所以此方法不能用?
#?!cp?simhei.?ttf?/usr/share/fonts/

#?創(chuàng)建系統(tǒng)字體文件路徑
!mkdir?.fonts
#?復(fù)制文件到該路徑
!cp?simhei.ttf?.fonts/
!rm?-rf?.cache/matplotlib

算法相關(guān)依賴

#?數(shù)據(jù)歸一化
from?sklearn.preprocessing?import?MinMaxScaler

#?kmeans聚類
from?sklearn.cluster?import?KMeans
#?DBSCAN聚類
from?sklearn.cluster?import?DBSCAN
#?線性回歸算法
from?sklearn.linear_model?import?LinearRegression
#?邏輯回歸算法
from?sklearn.linear_model?import?LogisticRegression
#?高斯貝葉斯
from?sklearn.naive_bayes?import?GaussianNB
#?劃分訓(xùn)練/測(cè)試集
from?sklearn.model_selection?import?train_test_split
#?準(zhǔn)確度報(bào)告
from?sklearn?import?metrics
#?矩陣報(bào)告和均方誤差
from?sklearn.metrics?import?classification_report,?mean_squared_error

獲取數(shù)據(jù)

from?sqlalchemy?import?create_engine
engine?=?create_engine('mysql+pymysql://root:[email protected]:3306/ry?charset=utf8')

#?查詢插入后相關(guān)表名及行數(shù)
result_query_sql?=?"use?information_schema;"
engine.execute(result_query_sql)
result_query_sql?=?"SELECT?table_name,table_rows?FROM?tables?WHERE?TABLE_NAME?LIKE?'log%%'?order?by?table_rows?desc;"
df_result?=?pd.read_sql(result_query_sql,?engine)

生成df

#?list轉(zhuǎn)df
df_result?=?pd.DataFrame(pred,columns=['pred'])
df_result['actual']?=?test_target
df_result

#?df取子df
df_new?=?df_old[['col1','col2']]

#?dict生成df
df_test?=?pd.DataFrame({'A':[0.587221,?0.135673,?0.135673,?0.135673,?0.135673],?
????????????????????????'B':['a',?'b',?'c',?'d',?'e'],
????????????????????????'C':[1,?2,?3,?4,?5]})

#?指定列名
data?=?pd.DataFrame(dataset.data,?columns=dataset.feature_names)

#?使用numpy生成20個(gè)指定分布(如標(biāo)準(zhǔn)正態(tài)分布)的數(shù)
tem?=?np.random.normal(0,?1,?20)
df3?=?pd.DataFrame(tem)

#?生成一個(gè)和df長(zhǎng)度相同的隨機(jī)數(shù)dataframe
df1?=?pd.DataFrame(pd.Series(np.random.randint(1,?10,?135)))

重命名列

#?重命名列
data_scaled?=?data_scaled.rename(columns={'本體油位':?'OILLV'})

增加列

#?df2df
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?新增一列根據(jù)salary將數(shù)據(jù)分為3組
bins?=?[0,5000,?20000,?50000]
group_names?=?['低',?'中',?'高']
df['categories']?=?pd.cut(df['salary'],?bins,?labels=group_names)

缺失值處理

#?檢查數(shù)據(jù)中是否含有任何缺失值
df.isnull().values.any()

#?查看每列數(shù)據(jù)缺失值情況
df.isnull().sum()

#?提取某列含有空值的行
df[df['日期'].isnull()]

#?輸出每列缺失值具體行數(shù)
for?i?in?df.columns:
????if?df[i].count()?!=?len(df):
????????row?=?df[i][df[i].isnull().values].index.tolist()
????????print('列名："{}", 第{}行位置有缺失值'.format(i,row))

#?眾數(shù)填充
heart_df['Thal'].fillna(heart_df['Thal'].mode(dropna=True)[0],?inplace=True)

#?連續(xù)值列的空值用平均值填充
dfcolumns?=?heart_df_encoded.columns.values.tolist()
for?item?in?dfcolumns:
????if?heart_df_encoded[item].dtype?==?'float':
???????heart_df_encoded[item].fillna(heart_df_encoded[item].median(),?inplace=True)

獨(dú)熱編碼

df_encoded?=?pd.get_dummies(df_data)

替換值

#?按列值替換
num_encode?=?{
????'AHD':?{'No':0,?"Yes":1},
}
heart_df.replace(num_encode,inplace=True)

刪除列

df_jj2.drop(['coll_time',?'polar',?'conn_type',?'phase',?'id',?'Unnamed:?0'],axis=1,inplace=True)

groupby

#?0.從sklearn加載iris數(shù)據(jù)集
from?sklearn?import?datasets
#?加載數(shù)據(jù)集和目標(biāo)
data,?target?=?datasets.load_iris(return_X_y=True,?as_frame=True)
#?合并數(shù)據(jù)集和目標(biāo)
iris?=?pd.concat([data,?target],?axis=1,?sort=False)
iris

#?創(chuàng)建groupby對(duì)象
iris_gb?=?iris.groupby('target')

#?1.?創(chuàng)建頻率表，輸出每個(gè)類中數(shù)量多少
iris_gb.size()

#?2.?計(jì)算常用的描述統(tǒng)計(jì)量
#?min、max()、medianhe、std等
#?計(jì)算均值
iris_gb.mean()
#?單列
iris_gb['sepal?length?(cm)'].mean()
#?雙列
iris_gb[['sepal?length?(cm)',?'sepal?width?(cm)']].mean()

#?3.?查找最大值（最小值）索引
iris_gb.idxmax()

#?按sepal_length最大值這個(gè)條件進(jìn)行了篩選
sepal_largest?=?iris.loc[iris_gb['sepal?length?(cm)'].idxmax()]

#?4.?Groupby之后重置索引
iris_gb.max().reset_index()
#?↑↓二者效果相同
iris.groupby('target',?as_index=False).max()

#?5.?多種統(tǒng)計(jì)量匯總，聚合函數(shù)agg
iris_gb[['sepal?length?(cm)',?'sepal?width?(cm)']].agg(["min",?"mean"])

#?6.特定列的聚合
#?為不同的列單獨(dú)設(shè)置不同的統(tǒng)計(jì)量
iris_gb.agg({"sepal?length?(cm)":?["min",?"max"],?"sepal?width?(cm)":?["mean",?"std"]})

#?7.?NamedAgg命名統(tǒng)計(jì)量
#?把每個(gè)列下面的統(tǒng)計(jì)量和列名分別合并起來?？梢允褂肗amedAgg來完成列的命名

iris_gb.agg(
?????sepal_min=pd.NamedAgg(column="sepal?length?(cm)",?aggfunc="min"),
?????sepal_max=pd.NamedAgg(column="sepal?length?(cm)",?aggfunc="max"),
?????petal_mean=pd.NamedAgg(column="petal?length?(cm)",?aggfunc="mean"),
?????petal_std=pd.NamedAgg(column="petal?length?(cm)",?aggfunc="std")
?)

#?下述更簡(jiǎn)潔
iris_gb.agg(
????sepal_min=("sepal?length?(cm)",?"min"),
????sepal_max=("sepal?length?(cm)",?"max"),
????petal_mean=("petal?length?(cm)",?"mean"),
????petal_std=("petal?length?(cm)",?"std")
)

#?8.?使用自定義函數(shù)
iris_gb.agg(pd.Series.mean)
#?不僅如此，名稱和功能對(duì)象也可一起使用。
iris_gb.agg(["min",?pd.Series.mean])
#?我們還可以自定義函數(shù)，也都是可以的。
def?double_length(x):
????return?2*x.mean()

iris_gb.agg(double_length)
#?如果想更簡(jiǎn)潔，也可以使用lambda函數(shù)。總之，用法非常靈活，可以自由組合搭配。
iris_gb.agg(lambda?x:?x.mean())

透視表

import?numpy?as?np
import?pandas?as?pd
import?seaborn?as?sns
titanic?=?sns.load_dataset('titanic')

titanic.pivot_table(index='sex',?columns='class')

#?默認(rèn)對(duì)所有列進(jìn)行聚合，這時(shí)我們給與values參數(shù)，只計(jì)算想要的結(jié)果
agg?=?pd.cut(titanic["age"],[0,18,80])?#?對(duì)年齡數(shù)據(jù)列進(jìn)行分段，便于觀看
titanic.pivot_table(index=['sex','age'],?columns='class',values=['survived','fare'])

#?在實(shí)際使用中，并不一定每次都要均值，使用aggfunc指定累計(jì)函數(shù)
titanic.pivot_table(index='sex',?columns='class',aggfunc={'survived':sum,?'fare':'mean'})

#?當(dāng)需要計(jì)算每一組的總數(shù)時(shí)，可以通過margins 參數(shù)來設(shè)置：
# margin 的標(biāo)簽可以通過margins_name 參數(shù)進(jìn)行自定義，默認(rèn)值是"All"。
titanic.pivot_table('survived',?index='sex',?columns='class',?margins=True)

數(shù)據(jù)篩選

#?取第33行數(shù)據(jù)
df.iloc[32]

#?某列以xxx字符串開頭
df_jj2?=?df_512.loc[df_512["transformer"].str.startswith('JJ2')]

df_jj2yya?=?df_jj2.loc[df_jj2["變壓器編號(hào)"]=='JJ2YYA']

#?提取第一列中不在第二列出現(xiàn)的數(shù)字
df['col1'][~df['col1'].isin(df['col2'])]

#?查找兩列值相等的行號(hào)
np.where(df.secondType?==?df.thirdType)

#?包含字符串
results?=?df['grammer'].str.contains("Python")

#?提取列名
df.columns

#?查看某列唯一值（種類）
df['education'].nunique()

#?刪除重復(fù)數(shù)據(jù)
df.drop_duplicates(inplace=True)

#?某列等于某值
df[df.col_name==0.587221]
#?df.col_name==0.587221?各行判斷結(jié)果返回值(True/False)

#?查看某列唯一值及計(jì)數(shù)
df_jj2["變壓器編號(hào)"].value_counts()

#?時(shí)間段篩選
df_jj2yyb_0501_0701?=?df_jj2yyb[(df_jj2yyb['r_time']?>=pd.to_datetime('20200501'))?&?(df_jj2yyb['r_time']?<=?pd.to_datetime('20200701'))]

#?數(shù)值篩選
df[(df['popularity']?>?3)?&?(df['popularity']?7)]

#?按數(shù)據(jù)類型選擇列
df?=?pd.DataFrame({'a':?[1,?2]?*?3,
???????????????????'b':?[True,?False]?*?3,
???????????????????'c':?[1.0,?2.0]?*?3})
print('df:',?df)

#?輸出包含?bool?數(shù)據(jù)類型的列
print('輸出包含?bool?數(shù)據(jù)類型的列:',?df.select_dtypes(include='bool'))

#?輸出包含小數(shù)數(shù)據(jù)類型的列
print('輸出包含小數(shù)數(shù)據(jù)類型的列:',?df.select_dtypes(include=['float64']))

#?輸出排除整數(shù)的列
print('輸出包含小數(shù)數(shù)據(jù)類型的列:',?df.select_dtypes(exclude=['int64']))

#?某列字符串截取
df['Time'].str[0:8]

#?隨機(jī)取num行
ins_1?=?df.sample(n=num)

#?數(shù)據(jù)去重
df.drop_duplicates(['grammer'])

#?按某列排序(降序)
df.sort_values("popularity",inplace=True,?ascending=False)

#?取某列最大值所在行
df[df['popularity']?==?df['popularity'].max()]

#?取某列最大num行
df.nlargest(num,'col_name')
#?最大num列畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

差值計(jì)算

# axis=0或index表示上下移動(dòng)， periods表示移動(dòng)的次數(shù)，為正時(shí)向下移，為負(fù)時(shí)向上移動(dòng)。
print(df.diff(?periods=1,?axis=‘index‘))
print(df.diff(?periods=-1,?axis=0))
# axis=1或columns表示左右移動(dòng)，periods表示移動(dòng)的次數(shù)，為正時(shí)向右移，為負(fù)時(shí)向左移動(dòng)。
print(df.diff(?periods=1,?axis=‘columns‘))
print(df.diff(?periods=-1,?axis=1))

#?變化率計(jì)算
data['收盤價(jià)(元)'].pct_change()

#?以5個(gè)數(shù)據(jù)作為一個(gè)數(shù)據(jù)滑動(dòng)窗口，在這個(gè)5個(gè)數(shù)據(jù)上取均值
df['收盤價(jià)(元)'].rolling(5).mean()

數(shù)據(jù)修改

#?刪除最后一行
df?=?df.drop(labels=df.shape[0]-1)

#?添加一行數(shù)據(jù)['Perl',6.6]
row?=?{'grammer':'Perl','popularity':6.6}
df?=?df.append(row,ignore_index=True)

#?某列小數(shù)轉(zhuǎn)百分?jǐn)?shù)
df.style.format({'data':?'{0:.2%}'.format})

#?反轉(zhuǎn)行
df.iloc[::-1,?:]

#?以兩列制作數(shù)據(jù)透視
pd.pivot_table(df,values=["salary","score"],index="positionId")

#?同時(shí)對(duì)兩列進(jìn)行計(jì)算
df[["salary","score"]].agg([np.sum,np.mean,np.min])

#?對(duì)不同列執(zhí)行不同的計(jì)算
df.agg({"salary":np.sum,"score":np.mean})

時(shí)間格式轉(zhuǎn)換

#?時(shí)間戳轉(zhuǎn)時(shí)間字符串
df_jj2['cTime']?=df_jj2['coll_time'].apply(lambda?x:?time.strftime("%Y-%m-%d?%H:%M:%S",?time.localtime(x)))

#?時(shí)間字符串轉(zhuǎn)時(shí)間格式
df_jj2yyb['r_time']?=?pd.to_datetime(df_jj2yyb['cTime'])

#?時(shí)間格式轉(zhuǎn)時(shí)間戳
dtime?=?pd.to_datetime(df_jj2yyb['r_time'])
v?=?(dtime.values?-?np.datetime64('1970-01-01T08:00:00Z'))?/?np.timedelta64(1,?'ms')
df_jj2yyb['timestamp']?=?v

設(shè)置索引列

df_jj2yyb_small_noise?=?df_jj2yyb_small_noise.set_index('timestamp')

折線圖

fig,?ax?=?plt.subplots()
df.plot(legend=True,?ax=ax)
plt.legend(loc=1)
plt.show()

plt.figure(figsize=(20,?6))
plt.plot(max_iter_list,?accuracy,?color='red',?marker='o',
?????????markersize=10)
plt.title('Accuracy?Vs?max_iter?Value')
plt.xlabel('max_iter?Value')
plt.ylabel('Accuracy')

散點(diǎn)圖

plt.scatter(df[:,?0],?df[:,?1],?c="red",?marker='o',?label='lable0')???
plt.xlabel('x')??
plt.ylabel('y')??
plt.legend(loc=2)??
plt.show()??

柱形圖

df?=?pd.Series(tree.feature_importances_,?index=data.columns)
#?取某列最大Num行畫橫向柱形圖
df.nlargest(10).plot(kind='barh')

熱力圖

df_corr?=?combine.corr()
plt.figure(figsize=(20,20))
g=sns.heatmap(df_corr,annot=True,cmap="RdYlGn")

66個(gè)最常用的pandas數(shù)據(jù)分析函數(shù)

df?#任何pandas?DataFrame對(duì)象?
s?#任何pandas?series對(duì)象

從各種不同的來源和格式導(dǎo)入數(shù)據(jù)

pd.read_csv(filename)?#?從CSV文件?
pd.read_table(filename)?#?從分隔的文本文件（例如CSV）中?
pd.read_excel(filename)?#?從Excel文件?
pd.read_sql(query,?connection_object)?#?從SQL表/數(shù)據(jù)庫中讀取?
pd.read_json(json_string)?#?從JSON格式的字符串，URL或文件中讀取。
pd.read_html(url)?#?解析html?URL，字符串或文件，并將表提取到數(shù)據(jù)幀列表?
pd.read_clipboard()?#?獲取剪貼板的內(nèi)容并將其傳遞給?read_table()?
pd.DataFrame(dict)?#?從字典中，列名稱的鍵，列表中的數(shù)據(jù)的值

導(dǎo)出數(shù)據(jù)

df.to_csv(filename)?#?寫入CSV文件?
df.to_excel(filename)?#?寫入Excel文件?
df.to_sql(table_name,?connection_object)?#?寫入SQL表?
df.to_json(filename)?#?以JSON格式寫入文件

創(chuàng)建測(cè)試對(duì)象

pd.DataFrame(np.random.rand(20,5))???????????????#?5列20行隨機(jī)浮點(diǎn)數(shù)?pd.Series(my_list)???????????????????????????????#?從一個(gè)可迭代的序列創(chuàng)建一個(gè)序列?my_list?
df.index?=?pd.date_range('1900/1/30',?periods=df.shape[0])?#?添加日期索引

查看、檢查數(shù)據(jù)

df.head(n)???????????????????????#?DataFrame的前n行?
df.tail(n)???????????????????????#?DataFrame的最后n行?
df.shape?????????????????????????#?行數(shù)和列數(shù)?
df.info()????????????????????????#?索引，數(shù)據(jù)類型和內(nèi)存信息?
df.describe()????????????????????#?數(shù)值列的摘要統(tǒng)計(jì)信息?
s.value_counts(dropna=False)?????#?查看唯一值和計(jì)數(shù)?
df.apply(pd.Series.value_counts)?#?所有列的唯一值和計(jì)數(shù)

數(shù)據(jù)選取

使用這些命令選擇數(shù)據(jù)的特定子集。
df[col]???????????????#?返回帶有標(biāo)簽col的列?
df[[col1,?col2]]??????#?返回列作為新的DataFrame?
s.iloc[0]?????????????#?按位置選擇?
s.loc['index_one']????#?按索引選擇?
df.iloc[0,:]??????????#?第一行?
df.iloc[0,0]??????????#?第一欄的第一元素

數(shù)據(jù)清理

df.columns?=?['a','b','c']??????????????????#?重命名列?
pd.isnull()?????????????????????????????????#?空值檢查，返回Boolean?Arrray?
pd.notnull()????????????????????????????????#?與pd.isnull()?相反?
df.dropna()?????????????????????????????????#?刪除所有包含空值的行?
df.dropna(axis=1)???????????????????????????#?刪除所有包含空值的列?
df.dropna(axis=1,thresh=n)??????????????????#?刪除所有具有少于n個(gè)非null值的行?
df.fillna(x)????????????????????????????????#?將所有空值替換為x?
s.fillna(s.mean())??????????????????????????#?用均值替換所有空值（均值可以用統(tǒng)計(jì)模塊中的幾乎所有函數(shù)替換?）?
s.astype(float)?????????????????????????????#?將系列的數(shù)據(jù)類型轉(zhuǎn)換為float?
s.replace(1,'one')??????????????????????????#?1?用?'one'?
s.replace([1,3],['one','three'])????????????#?替換所有等于的值?替換為所有1?'one'?，并?3?用?'three'?df.rename(columns=lambda?x:?x?+?1)??????????#?列的重命名?
df.rename(columns={'old_name':?'new_?name'})#?選擇性重命名?
df.set_index('column_one')??????????????????#?更改索引?
df.rename(index=lambda?x:?x?+?1)????????????#?大規(guī)模重命名索引

篩選，排序和分組依據(jù)

df[df[col]?>?0.5]??????????????????????#?列?col?大于?0.5?df[(df[col]?>?0.5)?&?(df[col]?
df.sort_values(col1)???????????????????#?按col1升序?qū)χ颠M(jìn)行排序?
df.sort_values(col2,ascending=False)???#?按col2?降序?qū)χ颠M(jìn)行?排序?
df.sort_values([col1,col2],ascending=[True,False])?#按?col1?升序排序，然后?col2?按降序排序?
df.groupby(col)????????????????????????#從一個(gè)欄返回GROUPBY對(duì)象?
df.groupby([col1,col2])?#?返回來自多個(gè)列的groupby對(duì)象?
df.groupby(col1)[col2]?????????????????#?返回中的值的平均值?col2，按中的值分組?col1?（平均值可以用統(tǒng)計(jì)模塊中的幾乎所有函數(shù)替換?）?
df.pivot_table(index=col1,values=[col2,col3],aggfunc=mean)?#?創(chuàng)建一個(gè)數(shù)據(jù)透視表組通過?col1?，并計(jì)算平均值的?col2?和?col3?
df.groupby(col1).agg(np.mean)??????????#?在所有列中找到每個(gè)唯一col1?組的平均值?
df.apply(np.mean)??????????????????????#np.mean()?在每列上應(yīng)用該函數(shù)?
df.apply(np.max,axis=1)????????????????#?np.max()?在每行上應(yīng)用功能

數(shù)據(jù)合并

df1.append(df2)???????????????????#?將df2添加?df1的末尾?（各列應(yīng)相同）?
pd.concat([df1,?df2],axis=1)??????#?將?df1的列添加到df2的末尾?（行應(yīng)相同）?
df1.join(df2,on=col1,how='inner')?# SQL樣式將列 df1 與 df2 行所在的列col 具有相同值的列連接起來。'how'可以是一個(gè)?'left'，?'right'，?'outer'，?'inner'

數(shù)據(jù)統(tǒng)計(jì)

df.describe()????#?數(shù)值列的摘要統(tǒng)計(jì)信息?
df.mean()????????#?返回均值的所有列?
df.corr()????????#?返回DataFrame中各列之間的相關(guān)性?
df.count()???????#?返回非空值的每個(gè)數(shù)據(jù)幀列中的數(shù)字?
df.max()?????????#?返回每列中的最高值?
df.min()?????????#?返回每一列中的最小值?
df.median()??????#?返回每列的中位數(shù)?
df.std()?????????#?返回每列的標(biāo)準(zhǔn)偏差

16個(gè)函數(shù)，用于數(shù)據(jù)清洗

#?導(dǎo)入數(shù)據(jù)集
import?pandas?as?pd

df?={'姓名':['?黃同學(xué)','黃至尊','黃老邪?','陳大美','孫尚香'],
?????'英文名':['Huang?tong_xue','huang?zhi_zun','Huang?Lao_xie','Chen?Da_mei','sun?shang_xiang'],
?????'性別':['男','women','men','女','男'],
?????'身份證':['463895200003128433','429475199912122345','420934199110102311','431085200005230122','420953199509082345'],
?????'身高':['mid:175_good','low:165_bad','low:159_bad','high:180_verygood','low:172_bad'],
?????'家庭住址':['湖北廣水','河南信陽','廣西桂林','湖北孝感','廣東廣州'],
?????'電話號(hào)碼':['13434813546','19748672895','16728613064','14561586431','19384683910'],
?????'收入':['1.1萬','8.5千','0.9萬','6.5千','2.0萬']}
df?=?pd.DataFrame(df)
df

1.cat函數(shù)

用于字符串的拼接

df["姓名"].str.cat(df["家庭住址"],sep='-'*3)

2.contains

判斷某個(gè)字符串是否包含給定字符

df["家庭住址"].str.contains("廣")

3.startswith/endswith

判斷某個(gè)字符串是否以…開頭/結(jié)尾

#?第一個(gè)行的“?黃偉”是以空格開頭的
df["姓名"].str.startswith("黃")?
df["英文名"].str.endswith("e")

4.count

計(jì)算給定字符在字符串中出現(xiàn)的次數(shù)

df["電話號(hào)碼"].str.count("3")

5.get

獲取指定位置的字符串

df["姓名"].str.get(-1)
df["身高"].str.split(":")df["身高"].str.split(":").str.get(0)

6.len

計(jì)算字符串長(zhǎng)度

df["性別"].str.len()

7.upper/lower

英文大小寫轉(zhuǎn)換

df["英文名"].str.upper()
df["英文名"].str.lower()

8.pad+side參數(shù)/center

在字符串的左邊、右邊或左右兩邊添加給定字符

df["家庭住址"].str.pad(10,fillchar="*")??????#?相當(dāng)于ljust()
df["家庭住址"].str.pad(10,side="right",fillchar="*")????#?相當(dāng)于rjust()
df["家庭住址"].str.center(10,fillchar="*")

9.repeat

重復(fù)字符串幾次

df["性別"].str.repeat(3)

10.slice_replace

使用給定的字符串，替換指定的位置的字符

df["電話號(hào)碼"].str.slice_replace(4,8,"*"*4)

11.replace

將指定位置的字符，替換為給定的字符串

df["身高"].str.replace(":","-")

12.replace

將指定位置的字符，替換為給定的字符串(接受正則表達(dá)式)

replace中傳入正則表達(dá)式，才叫好用；
先不要管下面這個(gè)案例有沒有用，你只需要知道，使用正則做數(shù)據(jù)清洗多好用；

df["收入"].str.replace("\d+\.\d+","正則")

13.split方法+expand參數(shù)

搭配join方法功能很強(qiáng)大

#?普通用法
df["身高"].str.split(":")
#?split方法，搭配expand參數(shù)
df[["身高描述","final身高"]]?=?df["身高"].str.split(":",expand=True)
df
#?split方法搭配join方法
df["身高"].str.split(":").str.join("?"*5)

14.strip/rstrip/lstrip

去除空白符、換行符

df["姓名"].str.len()
df["姓名"]?=?df["姓名"].str.strip()
df["姓名"].str.len()

15.findall

利用正則表達(dá)式，去字符串中匹配，返回查找結(jié)果的列表

findall使用正則表達(dá)式，做數(shù)據(jù)清洗，真的很香！

df["身高"]
df["身高"].str.findall("[a-zA-Z]+")

16.extract/extractall

接受正則表達(dá)式，抽取匹配的字符串(一定要加上括號(hào))

df["身高"].str.extract("([a-zA-Z]+)")
#?extractall提取得到復(fù)合索引
df["身高"].str.extractall("([a-zA-Z]+)")
#?extract搭配expand參數(shù)
df["身高"].str.extract("([a-zA-Z]+).*?([a-zA-Z]+)",expand=True)

·················END·················

1w 字 pands 核心操作手冊(cè),爆肝整理!