[TOC]
# 准备数据
~~~
import pandas as pd
frame = pd.DataFrame(
{'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
print(frame)
~~~
输出
~~~
data key
0 0 A
1 5 B
2 10 C
3 5 A
4 10 B
5 15 C
6 10 A
7 15 B
8 20 C
~~~
# 求和
求A,B,C品牌,销售额都是多少
传统做法
~~~
import pandas as pd
frame = pd.DataFrame(
{'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
for key in ['A', 'B', 'C']:
print(key, frame[frame['key'] == key].sum())
~~~
分组
~~~
import pandas as pd
frame = pd.DataFrame(
{'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
key__sum = frame.groupby('key').sum()
print(key__sum)
~~~
输出
~~~
data
key
A 15
B 30
C 45
~~~
# 求平均值
这3个品牌的平均值
~~~
import pandas as pd
import numpy as np
frame = pd.DataFrame(
{'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C', ], 'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]})
aggregate = frame.groupby('key').aggregate(np.mean)
print(aggregate)
~~~
输出
~~~
data
key
A 5
B 10
C 15
~~~
**按性别分组求年龄的平均值**
~~~
import pandas as pd
import numpy as np
csv = pd.read_csv('./titanic.csv')
mean = csv.groupby('Sex')['Age'].mean()
print(mean)
~~~
输出
~~~
Sex
female 27.915709
male 30.726645
Name: Age, dtype: float64
~~~
# 按某个分组统计
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
# 按A分组统计
groupby = df.groupby('A')
print(groupby.count())
~~~
输出
~~~
B C D
A
bar 3 3 3
foo 5 5 5
~~~
也可以指定多个字段
~~~
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
# 按A分组统计
groupby = df.groupby(['A', 'B'])
print(groupby.count())
~~~
输出
~~~
C D
A B
bar one 1 1
three 1 1
two 1 1
foo one 2 2
three 1 1
two 2 2
~~~
**传进来的字母小写在...里面就a,否则就b**
~~~
def get_letter_type(letter):
if letter.lower() in 'aeiou':
return 'a'
else:
return 'b'
grouped = df.groupby(get_letter_type,axis = 1)
grouped.count().iloc[0]
~~~
输出
~~~
a 1
b 3
Name: 0, dtype: int64
~~~
# 打印重复值
实际上是做了排序
~~~
s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5])
grouped = s.groupby(level=0)
print(s)
print(grouped.first())
print(grouped.last())
~~~
输出
~~~
8 1
7 2
5 3
8 1
7 2
5 3
dtype: int64
5 3
7 2
8 1
dtype: int64
5 3
7 2
8 1
dtype: int64
~~~
# 求和
~~~
import pandas as pd
import numpy as np
s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5])
grouped = s.groupby(level=0)
print(grouped.sum())
~~~
# 排序
~~~
s = pd.Series([1, 2, 3, 1, 2, 3], [8, 7, 5, 8, 7, 5])
grouped = s.groupby(level=0, sort=True)
print(grouped.first())
~~~
# 分组后关注某个值
~~~
df2 = pd.DataFrame({'x': ['A', 'B', 'A', 'B'], 'Y': [1, 2, 3, 4]})
print(df2.groupby(['x']).get_group('A'))
~~~
输出
~~~
x Y
0 A 1
2 A 3
~~~
# 构造多重索引
~~~
arrays = [
['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
]
# 设置2个索引
index = pd.MultiIndex.from_arrays(arrays, names=['first', 'second'])
print(index)
s = pd.Series(np.random.randn(8), index=index)
print(s)
~~~
输出
~~~
MultiIndex(levels=[['bar', 'baz', 'foo', 'qux'], ['one', 'two']],
codes=[[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]],
names=['first', 'second'])
first second
bar one 0.613790
two 1.432133
baz one 1.194075
two 1.863554
foo one -0.742547
two 0.228419
qux one 0.065728
two -2.212572
dtype: float64
~~~
取第一列求和
~~~
# level = 'first'这样写也可以
groupby = s.groupby(level=0)
print(groupby.sum())
~~~
# 以某些列为键,求和
顺便指定索引
~~~
df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B': ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C': np.random.randn(8),
'D': np.random.randn(8)})
grouped = df.groupby(['A', 'B'])
aggregate = grouped.aggregate(np.sum)
print(aggregate)
~~~
输出
~~~
C D
A B
bar one 0.832853 1.304246
three 1.510920 -0.056116
two 1.096638 -0.830680
foo one 0.642804 0.215040
three 0.228107 -1.037871
two -2.145560 0.015041
~~~
设置不指定索引
~~~
grouped = df.groupby(['A', 'B'], as_index=False)
aggregate = grouped.aggregate(np.sum)
print(aggregate)
~~~
输出
~~~
A B C D
0 bar one 0.659134 0.035910
1 bar three 0.873216 -0.432892
2 bar two -0.054760 -0.634816
3 foo one -0.662632 0.543061
4 foo three 0.043405 -1.381792
5 foo two -0.561851 -1.262610
~~~
# 重新构建索引
~~~
grouped = df.groupby(['A', 'B']).sum().reset_index()
print(grouped)
~~~
输出
~~~
A B C D
0 bar one -1.154648 -0.809538
1 bar three -0.623831 1.076197
2 bar two -0.496645 0.879416
3 foo one -0.439856 -0.279243
4 foo three 0.688289 -1.738350
5 foo two 0.832524 -0.520591
~~~
# 查看出现的次数
~~~
grouped = df.groupby(['A', 'B'])
print(grouped.size())
~~~
输出
~~~
A B
bar one 1
three 1
two 1
foo one 2
three 1
two 2
dtype: int64
~~~
# 查看统计信息
~~~
grouped = df.groupby(['A', 'B'])
print(grouped.describe())
~~~
输出
~~~
C ... D
count mean std ... 50% 75% max
A B ...
bar one 1.0 -1.105223 NaN ... -2.206875 -2.206875 -2.206875
three 1.0 0.585855 NaN ... -0.411104 -0.411104 -0.411104
two 1.0 -0.524709 NaN ... 0.516504 0.516504 0.516504
foo one 2.0 -0.223097 0.579265 ... -0.257382 -0.191427 -0.125473
three 1.0 -0.369408 NaN ... -1.330020 -1.330020 -1.330020
two 2.0 -1.060815 0.384927 ... 0.482748 0.933142 1.383536
~~~
# 查看部分统计信息
以A为分组,统计里面C的信息
~~~
grouped = df.groupby('A')
print(grouped['C'].agg([np.sum, np.mean, np.std]))
~~~
输出
~~~
sum mean std
A
bar -2.574704 -0.858235 0.975134
foo 0.429849 0.085970 0.901520
~~~
指定别名
~~~
grouped = df.groupby('A')
print(grouped['C'].agg({'res_sum': np.sum, 'res_mean': np.mean, 'res_std': np.std}))
~~~