[TOC]
# 构造DataFrame
~~~
import pandas as pd
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], 'data': [4, 3, 2, 1, 12, 3, 4, 5, 7]})
print(data)
~~~
输出
~~~
data group
0 4 a
1 3 a
2 2 a
3 1 b
4 12 b
5 3 b
6 4 c
7 5 c
8 7 c
~~~
# 指定某列升序,某列降序
~~~
import pandas as pd
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'], 'data': [4, 3, 2, 1, 12, 3, 4, 5, 7]})
# group降序,data升序,inplace是指在原始数据上做改变
data.sort_values(by=['group', 'data'], ascending=[False, True], inplace=True)
print(data)
~~~
输出
~~~
data group
6 4 c
7 5 c
8 7 c
3 1 b
5 3 b
4 12 b
2 2 a
1 3 a
0 4 a
~~~
# 构造DataFrame
~~~
data = pd.DataFrame({'k1':['one']*3+['two']*4,
'k2':[3,2,1,3,3,4,4]})
~~~
按k2排序
~~~
import pandas as pd
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [3, 2, 1, 3, 3, 4, 4]})
dt = data.sort_values(by='k2')
print(dt)
~~~
# 删除重复值
**2个列都重复才删掉**
~~~
import pandas as pd
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [3, 2, 1, 3, 3, 4, 4]})
# 2个列都重复才删掉
dt = data.drop_duplicates()
print(dt)
~~~
输出
~~~
k1 k2
0 one 3
1 one 2
2 one 1
3 two 3
5 two 4
~~~
**按一列只要重复就删掉**
~~~
import pandas as pd
data = pd.DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
'k2': [3, 2, 1, 3, 3, 4, 4]})
# 按一列只要重复就删掉
dt = data.drop_duplicates(subset='k1')
print(dt)
~~~
输出
~~~
k1 k2
0 one 3
3 two 3
~~~
# 构造DataFrame
~~~
data = pd.DataFrame({'food':['A1','A2','B1','B2','B3','C1','C2'],'data':[1,2,3,4,5,6,7]})
~~~
![](https://box.kancloud.cn/827943ab2e5da25b6c3d2050e5a53da0_102x211.png)
# 对数据每行每列做同样的事情
~~~
import pandas as pd
data = pd.DataFrame({'food': ['A1', 'A2', 'B1', 'B2', 'B3', 'C1', 'C2'], 'data': [1, 2, 3, 4, 5, 6, 7]})
# A1,A2归为A;B1,B2,B3归为B;C1,C2归为C
def food_map(series):
    """Return the category letter for the ``food`` value of one row.

    Args:
        series: A row-like object indexable by the key ``'food'`` (for
            example the row Series that ``DataFrame.apply`` passes when
            called with ``axis='columns'``).

    Returns:
        ``'A'`` for A1/A2, ``'B'`` for B1/B2/B3, ``'C'`` for C1/C2, and
        ``None`` for any other value.
    """
    food = series['food']
    if food in ('A1', 'A2'):
        return 'A'
    if food in ('B1', 'B2', 'B3'):
        return 'B'
    if food in ('C1', 'C2'):
        return 'C'
    # Unknown foods yield None, which shows up as a missing value in the
    # resulting column.
    return None
# 增加一列
data['food_map'] = data.apply(food_map, axis='columns')
print(data)
~~~
输出
~~~
data food food_map
0 1 A1 A
1 2 A2 A
2 3 B1 B
3 4 B2 B
4 5 B3 B
5 6 C1 C
6 7 C2 C
~~~
也可以用字典做
~~~
food2Upper = {
'A1':'A',
'A2':'A',
'B1':'B',
'B2':'B',
'B3':'B',
'C1':'C',
'C2':'C'
}
data['upper'] = data['food'].map(food2Upper)
~~~
# 构造几行,并加一个列
构造5行,并增加一列为每行的比值,你也可以做其他运算
~~~
import numpy as np
import pandas as pd

# Build five rows of random data in two columns.
df = pd.DataFrame({'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})
# assign() returns a new DataFrame with the extra column; df is not modified.
df2 = df.assign(ration=df['data1'] / df['data2'])
print(df2)
~~~
输出
~~~
data1 data2 ration
0 -0.892035 0.653509 -1.364992
1 1.334756 -0.053562 -24.919813
2 0.461612 -1.638151 -0.281788
3 -2.489553 -1.171239 2.125572
4 -1.043313 0.046131 -22.616422
~~~
# 删除列
~~~
import numpy as np
import pandas as pd
df = pd.DataFrame({'data1': np.random.randn(5),
'data2': np.random.randn(5)})
df2 = df.assign(ration=df['data1'] / df['data2'])
df2.drop('ration', axis='columns', inplace=True)
print(df2)
~~~
输出
~~~
data1 data2
0 0.294873 0.809813
1 0.074282 -1.940564
2 -0.261030 -2.511745
3 -0.888109 0.406030
4 0.576189 0.582593
~~~
# 替换值
~~~
import pandas as pd
import numpy as np
data = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9])
# 把9这个值替换为NaN
data.replace(9, np.nan, inplace=True)
print(data)
~~~
输出
~~~
0 1.0
1 2.0
2 3.0
3 4.0
4 5.0
5 6.0
6 7.0
7 8.0
8 NaN
dtype: float64
~~~
# 数据切分,映射到区间
~~~
import pandas as pd
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
# 以bins为区间切分
bins_res = pd.cut(ages, bins)
print(bins_res)
~~~
输出
~~~
[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (10, 40], (40, 80], (40, 80], (40, 80], (40, 80]]
Categories (2, interval[int64]): [(10, 40] < (40, 80]]
~~~
# 统计每个区间的个数
~~~
import pandas as pd

ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
bins = [10, 40, 80]
# Bucket each age into the intervals (10, 40] and (40, 80].
bins_res = pd.cut(ages, bins)
# Count the members of each interval. The top-level pd.value_counts() is
# deprecated, so wrap the result in a Series and call its method instead.
counts = pd.Series(bins_res).value_counts()
print(counts)
~~~
输出
~~~
(10, 40] 6
(40, 80] 4
dtype: int64
~~~
也可以直接把区间边界列表作为bins参数传进来
~~~
pd.cut(ages,[10,30,50,80])
~~~
**用别名代替**
~~~
import pandas as pd

ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79]
# One label per interval, in the same order as the bin edges below.
group_names = ['Yonth', 'Mille', 'Old']
# Bucket the ages into (10, 20], (20, 50], (50, 80] and name each bucket.
labelled = pd.cut(ages, [10, 20, 50, 80], labels=group_names)
# The top-level pd.value_counts() is deprecated; use Series.value_counts().
counts = pd.Series(labelled).value_counts()
print(counts)
~~~
输出
~~~
Mille 4
Old 3
Yonth 3
dtype: int64
~~~
# 找出缺失值
![](https://box.kancloud.cn/37e8db018514194025b500cb30bc4c8d_119x127.png)
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
# bool判断有没有缺失值
print(df.isnull())
~~~
输出
~~~
0 1 2
0 False False False
1 False True False
2 False False True
3 False False False
~~~
# 按行(axis=1)或按列(axis=0)查看是否存在null
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
print(df.isnull().any(axis = 1))
~~~
输出
~~~
0 False
1 True
2 True
3 False
dtype: bool
~~~
# 用指定的值填充缺失值
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
fillna = df.fillna(5)
print(fillna)
~~~
输出
~~~
0 1 2
0 0 1.0 2.0
1 0 5.0 0.0
2 0 0.0 5.0
3 0 1.0 2.0
~~~
# 找到含有缺失值的行
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame([range(3), [0, np.nan, 0], [0, 0, np.nan], range(3)])
dt = df[df.isnull().any(axis=1)]
print(dt)
~~~
输出
~~~
0 1 2
1 0 NaN 0.0
2 0 0.0 NaN
~~~