鉴于此设置:
import pandas as pd
import datetime as DT
# Sample purchase log: one row per purchase, indexed by purchase timestamp.
df = pd.DataFrame(
    data={
        'Buyer': 'Carl Mark Carl Joe Joe Carl'.split(),
        'Product': list('ABAABA'),
        'Quantity': [5, 2, 5, 10, 1, 5],
    },
    index=[
        DT.datetime(2013, 1, day, hour, minute)
        for day, hour, minute in (
            (1, 13, 0), (1, 13, 5), (1, 20, 0),
            (2, 10, 0), (2, 12, 0), (2, 14, 0),
        )
    ],
)
print(df)
#                      Buyer Product  Quantity
# 2013-01-01 13:00:00   Carl       A         5
# 2013-01-01 13:05:00   Mark       B         2
# 2013-01-01 20:00:00   Carl       A         5
# 2013-01-02 10:00:00    Joe       A        10
# 2013-01-02 12:00:00    Joe       B         1
# 2013-01-02 14:00:00   Carl       A         5
每位客户每天在该部门停留了多长时间(最后一次购买时间 − 第一次购买时间)?如果当天只有一次购买,则假设停留了 1 小时。
def lingertime(df, default=DT.timedelta(hours=1)):
    """Return how long each buyer lingered on each calendar day.

    The linger time of a (day, buyer) pair is the span between that buyer's
    first and last purchase timestamp on that day.  A pair with a zero span
    (e.g. a single purchase) is assigned ``default`` instead.

    Args:
        df: Frame indexed by purchase timestamp (DatetimeIndex) with a
            'Buyer' column.
        default: Linger time to assume when first and last purchase
            coincide.  Defaults to one hour.

    Returns:
        Series of timedeltas indexed by (date, Buyer).
    """
    # Vectorised date extraction instead of a per-row Python lambda.
    days = df.index.date
    zero_span = DT.timedelta(0)

    def linger(grp):
        span = grp.index.max() - grp.index.min()
        # Explicit comparison rather than relying on timedelta truthiness.
        return default if span == zero_span else span

    return df.groupby([days, 'Buyer']).apply(linger)
# Linger time per (date, buyer) for the sample frame.  The figures below are
# the recorded sample output; exact formatting depends on the pandas version.
print(lingertime(df))
# date        Buyer
# 2013-01-01  Carl     7:00:00
#             Mark     1:00:00
# 2013-01-02  Carl     1:00:00
#             Joe      2:00:00
在每天的午夜到中午、以及中午到午夜这两个时段里,最畅销的产品分别是什么?
def product_quantity(df, from_hour, to_hour):
    """Total quantity sold per product within a time-of-day window.

    Rows are kept when the time of day of their index lies in the half-open
    window ``[from_hour:00, to_hour:00)``, regardless of the date.  When
    ``from_hour`` is later than ``to_hour`` the window wraps past midnight,
    so ``(12, 0)`` selects noon up to midnight.

    Args:
        df: Frame indexed by purchase timestamp (DatetimeIndex) with
            'Product' and 'Quantity' columns.
        from_hour: Inclusive start hour of the window (0-23).
        to_hour: Exclusive end hour of the window (0-23).

    Returns:
        DataFrame indexed by Product with a single 'Quantity' column,
        sorted by quantity in descending order.
    """
    positions = df.index.indexer_between_time(
        DT.time(from_hour), DT.time(to_hour),
        include_start=True, include_end=False)
    # indexer_between_time yields integer positions, so select with .iloc
    # (the old .ix accessor no longer exists in pandas).
    window = df.iloc[positions]
    # Sum only 'Quantity'; summing the whole group would also try to
    # aggregate the string 'Buyer' column.
    totals = window.groupby('Product')[['Quantity']].sum()
    return totals.sort_values('Quantity', ascending=False)
# Midnight up to (but excluding) noon: only the 10:00 purchase qualifies.
print(product_quantity(df, 0, 12))
#          Quantity
# Product
# A              10

# Noon up to midnight (start hour later than end hour wraps the window).
print(product_quantity(df, 12, 0))
#          Quantity
# Product
# A              15
# B               3
如何不使用内置的平均值函数,而是通过自定义函数来计算平均采购数量等分组统计数据?
def average_quantity_per_product(df):
    """Compute the mean purchased quantity of every product.

    Demonstrates a user-defined per-group statistic: each 'Product' group
    is handed to a custom callable through ``groupby(...).apply``.

    Args:
        df: Frame with 'Product' and 'Quantity' columns.

    Returns:
        Series indexed by Product holding the mean of 'Quantity'.
    """
    by_product = df.groupby('Product')
    return by_product.apply(lambda rows: rows['Quantity'].mean())
# Mean quantity per product: A = (5 + 5 + 10 + 5) / 4, B = (2 + 1) / 2.
print(average_quantity_per_product(df))
# Product
# A    6.25
# B    1.50
要将一位买家与按天分组的其他买家进行比较:
def compare_buyers_with(df, name):
    """Compare every buyer's daily total quantity against one buyer's.

    For each calendar day, the reference buyer's total quantity for that day
    is subtracted from every buyer's total.  On days where the reference
    buyer bought nothing, the reference total is taken as 0.

    Args:
        df: Frame indexed by purchase timestamp (DatetimeIndex) with
            'Buyer' and 'Quantity' columns.
        name: Buyer to compare everyone else against.

    Returns:
        Series named 'Quantity', indexed by (date, Buyer), holding the
        difference to the reference buyer.  The shape is the same
        whichever buyers appear on which days.
    """
    days = df.index.date
    # One aggregation over (day, buyer) instead of a nested groupby/apply:
    # apply() would switch to a wide DataFrame whenever every day happens to
    # contain the same set of buyers (or there is only one day).
    totals = df.groupby([days, 'Buyer'])['Quantity'].sum()
    reference = {
        day: quantity
        for (day, buyer), quantity in totals.items()
        if buyer == name
    }
    baseline = [reference.get(day, 0) for day, _ in totals.index]
    return totals - baseline
# Daily totals of every buyer minus Carl's total on the same day.
print(compare_buyers_with(df, 'Carl'))
#             Buyer
# 2013-01-01  Carl     0
#             Mark    -8
# 2013-01-02  Carl     0
#             Joe      6
# Name: Quantity
要查找产品未售出的日期:
def days_when_not_sold(df, name):
    """List the calendar days on which a product was not sold.

    Only days that appear in ``df`` are considered; a day with no rows at
    all is never reported.

    Args:
        df: Frame indexed by purchase timestamp (DatetimeIndex) with a
            'Product' column.
        name: Product to look for.

    Returns:
        NumPy array of the dates on which no row has ``Product == name``,
        in ascending order.
    """
    days = df.index.date
    # Per-day flag "was the product sold at least once?", computed without
    # numpy (which this file does not import) and without a Python callback.
    sold = (df['Product'] == name).groupby(days).any()
    return sold.index.values[~sold.to_numpy(dtype=bool)]
# Product 'A' was sold on both days, so no day is reported.
print(days_when_not_sold(df, 'A'))
# []
# Product 'C' never appears in the data, so both days are reported.
print(days_when_not_sold(df, 'C'))
# [2013-01-01 2013-01-02]