企业🤖AI智能体构建引擎,智能编排和调试,一键部署,支持私有化部署方案 广告
[TOC] # 构造DataFrame ~~~ import pandas as pd data = pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], 'data': [4, 3, 2, 1, 12, 3, 4, 5, 7]}) print(data) ~~~ 输出 ~~~ data group 0 4 a 1 3 a 2 2 a 3 1 b 4 12 b 5 3 b 6 4 c 7 5 c 8 7 c ~~~ # 指定某列升序,某列降序 ~~~ import pandas as pd data = pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], 'data': [4, 3, 2, 1, 12, 3, 4, 5, 7]}) # group降序,data升序,inplace是指在原始数据上做改变 data.sort_values(by=['group', 'data'], ascending=[False, True], inplace=True) print(data) ~~~ 输出 ~~~ data group 6 4 c 7 5 c 8 7 c 3 1 b 5 3 b 4 12 b 2 2 a 1 3 a 0 4 a ~~~ # 构造DataFrame ~~~ data = pd.DataFrame({'k1':['one']*3+['two']*4, 'k2':[3,2,1,3,3,4,4]}) ~~~ 按k2排序 ~~~ import pandas as pd data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [3, 2, 1, 3, 3, 4, 4]}) dt = data.sort_values(by='k2') print(dt) ~~~ # 删除重复值 **2个列都重复才删掉** ~~~ import pandas as pd data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [3, 2, 1, 3, 3, 4, 4]}) # 2个列都重复才删掉 dt = data.drop_duplicates() print(dt) ~~~ 输出 ~~~ k1 k2 0 one 3 1 one 2 2 one 1 3 two 3 5 two 4 ~~~ **按一列只要重复就删掉** ~~~ import pandas as pd data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4, 'k2': [3, 2, 1, 3, 3, 4, 4]}) # 按一列只要重复就删掉 dt = data.drop_duplicates(subset='k1') print(dt) ~~~ 输出 ~~~ k1 k2 0 one 3 3 two 3 ~~~ # 构造DataFrame ~~~ data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]}) ~~~ ![](https://box.kancloud.cn/827943ab2e5da25b6c3d2050e5a53da0_102x211.png) # 对数据每行每列做同样的事情 ~~~ import pandas as pd data = pd.DataFrame({'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'], 'data': [1, 2, 3, 4, 5, 6, 7]}) # A1,A2,A3都归为A def food_map(series): if series['food'] == 'A1': return 'A' elif series['food'] == 'A2': return 'A' elif series['food'] == 'B1': return 'B' elif series['food'] == 'B2': return 'B' elif series['food'] == 'B3': return 'B' elif series['food'] == 'C1': return 'C' elif series['food'] == 'C2': return 'C' # 增加一列 data['food_map'] = 
data.apply(food_map, axis='columns') print(data) ~~~ 输出 ~~~ data food food_map 0 1 A1 A 1 2 A2 A 2 3 B1 B 3 4 B2 B 4 5 B3 B 5 6 C1 C 6 7 C2 C ~~~ 也可以用字典做 ~~~ food2Upper = { 'A1':'A', 'A2':'A', 'B1':'B', 'B2':'B', 'B3':'B', 'C1':'C', 'C2':'C' } data['upper'] = data['food'].map(food2Upper) ~~~ # 构造几行,并加一个列 构造5行,并增加一列为每行的比值,你也可以做其他运算 ~~~ import numpy as np df = pd.DataFrame({'data1':np.random.randn(5), 'data2':np.random.randn(5)}) df2 = df.assign(ration = df['data1']/df['data2']) ~~~ 输出 ~~~ data1 data2 ration 0 -0.892035 0.653509 -1.364992 1 1.334756 -0.053562 -24.919813 2 0.461612 -1.638151 -0.281788 3 -2.489553 -1.171239 2.125572 4 -1.043313 0.046131 -22.616422 ~~~ # 删除列 ~~~ import numpy as np import pandas as pd df = pd.DataFrame({'data1': np.random.randn(5), 'data2': np.random.randn(5)}) df2 = df.assign(ration=df['data1'] / df['data2']) df2.drop('ration', axis='columns', inplace=True) print(df2) ~~~ 输出 ~~~ data1 data2 0 0.294873 0.809813 1 0.074282 -1.940564 2 -0.261030 -2.511745 3 -0.888109 0.406030 4 0.576189 0.582593 ~~~ # 替换值 ~~~ import pandas as pd import numpy as np data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9]) # 把9这个值替换为NaN data.replace(9, np.nan, inplace=True) print(data) ~~~ 输出 ~~~ 0 1.0 1 2.0 2 3.0 3 4.0 4 5.0 5 6.0 6 7.0 7 8.0 8 NaN dtype: float64 ~~~ # 数据切分,映射到区间 ~~~ import pandas as pd ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79] bins = [10, 40, 80] # 以bins为区间切分 bins_res = pd.cut(ages, bins) print(bins_res) ~~~ 输出 ~~~ [(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]] Categories (2, interval[int64]): [(10, 40] < (40, 80]] ~~~ # 统计每个区间的个数 ~~~ import pandas as pd ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79] bins = [10, 40, 80] # 以bins为区间切分 bins_res = pd.cut(ages, bins) # 统计每个区间的个数 counts = pd.value_counts(bins_res) print(counts) ~~~ 输出 ~~~ (10, 40] 6 (40, 80] 4 dtype: int64 ~~~ 也可以把bins传进来 ~~~ pd.cut(ages,[10,30,50,80]) ~~~ **用别名代替** ~~~ import pandas as pd ages = [15, 18, 20, 21, 22, 34, 41, 52, 
63, 79] # 用别名代替区间 group_names = ['Yonth', 'Mille', 'Old'] counts = pd.value_counts(pd.cut(ages, [10, 20, 50, 80], labels=group_names)) print(counts) ~~~ 输出 ~~~ Mille 4 Old 3 Yonth 3 dtype: int64 ~~~ # 找出缺失值 ![](https://box.kancloud.cn/37e8db018514194025b500cb30bc4c8d_119x127.png) ~~~ import pandas as pd import numpy as np df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)]) # bool判断有没有缺失值 print(df.isnull()) ~~~ 输出 ~~~ 0 1 2 0 False False False 1 False True False 2 False False True 3 False False False ~~~ # 按行判断是否含有缺失值(axis=1) ~~~ import pandas as pd import numpy as np df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)]) print(df.isnull().any(axis = 1)) ~~~ 输出 ~~~ 0 False 1 True 2 True 3 False dtype: bool ~~~ # 用指定值填充缺失值 ~~~ import pandas as pd import numpy as np df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)]) fillna = df.fillna(5) print(fillna) ~~~ 输出 ~~~ 0 1 2 0 0 1.0 2.0 1 0 5.0 0.0 2 0 0.0 5.0 3 0 1.0 2.0 ~~~ # 找到含有缺失值的行 ~~~ import pandas as pd import numpy as np df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)]) dt = df[df.isnull().any(axis=1)] print(dt) ~~~ 输出 ~~~ 0 1 2 1 0 NaN 0.0 2 0 0.0 NaN ~~~