Pandas Dataframe Groupby Based On Condition
The most similar question I found was here, but it has no proper answer. Basically, I have an issue where I'm trying to use groupby on a DataFrame to generate unique IDs for bus routes.
Solution 1:
One idea is to factorize via `np.select`, then use a custom loop via `numba`:
from numba import njit
# Sample data: a single vehicle whose six-step position cycle repeats three times.
position_cycle = ['START', 'MID', 'MID', 'END', 'MID', 'START']
df = pd.DataFrame({
    'Vehicle_ID': ['A'] * 18,
    'Position': position_cycle * 3,
})
@njit
def grouper(pos):
    """Label every element of ``pos`` with the number of its START..END run.

    ``pos`` holds integer position codes as produced by the ``np.select``
    call in this snippet: 0 for START, 1 for END and -1 for anything else.
    It is indexed with a single integer, so it is expected to be 1-D.

    Returns an array of the same shape in NumPy's default float dtype (the
    caller casts it with ``.astype(int)``).  Elements inside the first run
    are labelled 1, inside the second run 2, and so on; elements that are
    not inside any run are labelled 0.
    """
    res = np.empty(pos.shape)
    num = 1      # label to assign to the current (or next) run
    started = 0  # 1 while a run is open, 0 otherwise
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            # START while idle: open a new run
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            # END closes the open run; the END row still belongs to it
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            # intermediate stop, or a repeated START, inside an open run
            res[i] = num
        else:
            # not inside a run (this includes an END with no open run)
            res[i] = 0
    return res
# Encode each position as an integer: START -> 0, END -> 1, anything else -> -1.
is_start = df['Position'].eq('START')
is_end = df['Position'].eq('END')
arr = np.select([is_start, is_end], [0, 1], -1)

# Cast the labels returned by grouper to integers before storing them.
group_labels = grouper(arr)
df['Group'] = group_labels.astype(int)
Result:
print(df)
    Position  Vehicle_ID  Group
0      START           A      1
1        MID           A      1
2        MID           A      1
3        END           A      1
4        MID           A      0
5      START           A      2
6      START           A      2
7        MID           A      2
8        MID           A      2
9        END           A      2
10       MID           A      0
11     START           A      3
12     START           A      3
13       MID           A      3
14       MID           A      3
15       END           A      3
16       MID           A      0
17     START           A      4
In my opinion, you should not include "blank" values, as this would force your series to be of `object` dtype, which is inefficient for any subsequent processing. As above, you can use `0` instead.
Performance benchmarking
`numba` is around 10x faster than one pure Pandas approach:
import pandas as pd, numpy as np
from numba import njit
df = pd.DataFrame({'Vehicle_ID': ['A']*18,
'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})
df = pd.concat([df]*10, ignore_index=True)
assert joz(df.copy()).equals(jpp(df.copy()))
%timeit joz(df.copy()) # 18.6 ms per loop
%timeit jpp(df.copy()) # 1.95 ms per loop
Benchmarking functions:
def joz(df):
    """Pure-pandas benchmark: add a ``'Group'`` column numbering START..END runs.

    ``df`` must have a ``'Position'`` column containing ``'START'``, ``'END'``
    and other values.  The helper columns ``Position_Prev``, ``Sequence`` and
    ``Sequence_Sum`` are written into the frame that is passed in, so pass a
    copy if the caller's frame must stay untouched.

    Returns a new frame (the result of a merge) with the helper columns
    removed and an integer ``'Group'`` column: rows kept by the START/END
    filter get a running group number, all other rows get 0.
    """
    # identification of sequences
    df['Position_Prev'] = df['Position'].shift(1)
    df['Sequence'] = 0
    # +1 on the first START of a run; a START directly after a START is ignored
    df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
    # -1 on every END
    df.loc[df['Position'] == 'END', 'Sequence'] = -1
    df['Sequence_Sum'] = df['Sequence'].cumsum()
    # force END rows to 1 so that they are kept together with their run
    df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1

    # take only items between START and END and generate Group number
    df2 = df[df['Sequence_Sum'] == 1].copy()
    # END markers must not decrement the group counter
    df2.loc[df2['Sequence'] == -1, 'Sequence'] = 0
    df2['Group'] = df2['Sequence'].cumsum()

    # merge results to one dataframe
    df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
    # rows that were filtered out have no match in the left merge -> group 0
    df['Group'] = df['Group'].fillna(0)
    df['Group'] = df['Group'].astype(int)
    df.drop(['Position_Prev', 'Sequence', 'Sequence_Sum'], axis=1, inplace=True)
    return df
@njit
def grouper(pos):
    """Label every element of ``pos`` with the number of its START..END run.

    ``pos`` holds integer position codes: 0 for START, 1 for END and -1 for
    anything else (this is how ``jpp`` encodes the ``'Position'`` column).
    It is indexed with a single integer, so it is expected to be 1-D.

    Returns an array of the same shape in NumPy's default float dtype (the
    caller casts it with ``.astype(int)``).  Elements inside the first run
    are labelled 1, inside the second run 2, and so on; elements that are
    not inside any run are labelled 0.
    """
    res = np.empty(pos.shape)
    num = 1      # label to assign to the current (or next) run
    started = 0  # 1 while a run is open, 0 otherwise
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            # START while idle: open a new run
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            # END closes the open run; the END row still belongs to it
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            # intermediate stop, or a repeated START, inside an open run
            res[i] = num
        else:
            # not inside a run (this includes an END with no open run)
            res[i] = 0
    return res
def jpp(df):
    """Numba benchmark: add a ``'Group'`` column to ``df`` via ``grouper``.

    ``df`` must have a ``'Position'`` column.  The ``'Group'`` column is
    written into the frame that is passed in, and that same frame is returned.
    """
    # Encode positions as integers: START -> 0, END -> 1, anything else -> -1.
    arr = np.select([df['Position'].eq('START'), df['Position'].eq('END')], [0, 1], -1)
    df['Group'] = grouper(arr).astype(int)
    return df
Solution 2:
I have a solution. You should avoid loops and instead try using shifting (sliding), slicing and merging.
This is my first prototype (it should be refactored):
# NOTE: expects an existing DataFrame `df` with a 'Position' column
# containing 'START', 'END' and other values.

# identification of sequences
df['Position_Prev'] = df['Position'].shift(1)
df['Sequence'] = 0
# +1 on the first START of a run; a START directly after a START is ignored
df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
# -1 on every END
df.loc[df['Position'] == 'END', 'Sequence'] = -1
df['Sequence_Sum'] = df['Sequence'].cumsum()
# force END rows to 1 so that they are kept together with their run
df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1

# take only items between START and END and generate Group number
df2 = df[df['Sequence_Sum'] == 1].copy()
# END markers must not decrement the group counter
df2.loc[df2['Sequence'] == -1, 'Sequence'] = 0
df2['Group'] = df2['Sequence'].cumsum()

# merge results to one dataframe
df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
# rows that were filtered out have no match in the left merge -> group 0
df['Group'] = df['Group'].fillna(0)
df['Group'] = df['Group'].astype(int)
df.drop(columns=['Position_Prev', 'Sequence', 'Sequence_Sum'], inplace=True)
df
Result:
    Vehicle_ID  Position  Group
0            A     START      1
1            A       MID      1
2            A       MID      1
3            A       END      1
4            A       MID      0
5            A     START      2
6            A     START      2
7            A       MID      2
8            A       MID      2
9            A       END      2
10           A       MID      0
11           A     START      3
12           A     START      3
13           A       MID      3
14           A       MID      3
15           A       END      3
16           A       MID      0
17           A     START      4
Post a Comment for "Pandas Dataframe Groupby Based On Condition"