
Pandas Dataframe Groupby Based On Condition

The most similar question I found was here, but with no proper answer. Basically, I have an issue where I'm trying to use groupby on a dataframe to generate unique IDs for bus routes.

Solution 1:

One idea is to factorize via np.select, then use a custom loop via numba:

import pandas as pd
import numpy as np
from numba import njit

df = pd.DataFrame({'Vehicle_ID': ['A']*18,
                   'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})

@njit
def grouper(pos):
    res = np.empty(pos.shape)
    num = 1        # next group number to assign
    started = 0    # 1 while we are inside a START..END sequence
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            # START while no trip is open: open a new group
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            # END while a trip is open: close it and move to the next group number
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            # MID (or another START) inside an open trip keeps the current group
            res[i] = num
        else:
            # rows outside any trip get group 0
            res[i] = 0
    return res

arr = np.select([df['Position'].eq('START'), df['Position'].eq('END')], [0, 1], -1)

df['Group'] = grouper(arr).astype(int)
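
For reference, np.select here just encodes each position as an integer before the loop runs. A quick check on the sample data above (the printed values are what that encoding produces):

# START -> 0, END -> 1, anything else (MID) -> -1
print(arr[:6])  # [ 0 -1 -1  1 -1  0]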

Result:

print(df)

   Position Vehicle_ID  Group
0     START          A      1
1       MID          A      1
2       MID          A      1
3       END          A      1
4       MID          A      0
5     START          A      2
6     START          A      2
7       MID          A      2
8       MID          A      2
9       END          A      2
10      MID          A      0
11    START          A      3
12    START          A      3
13      MID          A      3
14      MID          A      3
15      END          A      3
16      MID          A      0
17    START          A      4

In my opinion, you should not include "blank" values, as this would force your series to object dtype, which is inefficient for any subsequent processing. As above, you can use 0 instead.
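
To illustrate the dtype point, a minimal sketch (not part of the original answer):

import pandas as pd

pd.Series([1, 2, ''])   # mixing ints with a blank string -> dtype: object
pd.Series([1, 2, 0])    # staying numeric                 -> dtype: int64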

Performance benchmarking

numba is around 10x faster than the pure Pandas approach:

import pandas as pd, numpy as np
from numba import njit

df = pd.DataFrame({'Vehicle_ID': ['A']*18,
                   'Position': ['START', 'MID', 'MID', 'END', 'MID', 'START']*3})


df = pd.concat([df]*10, ignore_index=True)

assert joz(df.copy()).equals(jpp(df.copy()))

%timeit joz(df.copy())  # 18.6 ms per loop
%timeit jpp(df.copy())  # 1.95 ms per loop
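
Note that %timeit is an IPython/Jupyter magic. Outside a notebook, the same comparison could be run with the standard library instead, for example (a rough equivalent, not part of the original benchmark):

import timeit

print(timeit.timeit(lambda: joz(df.copy()), number=100))
print(timeit.timeit(lambda: jpp(df.copy()), number=100))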

Benchmarking functions:

def joz(df):
    # identification of sequences
    df['Position_Prev'] = df['Position'].shift(1)
    df['Sequence'] = 0
    df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
    df.loc[df['Position'] == 'END', 'Sequence'] = -1
    df['Sequence_Sum'] = df['Sequence'].cumsum()
    df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1

    # take only items between START and END and generate Group number
    df2 = df[df['Sequence_Sum'] == 1].copy()
    df2.loc[df['Sequence'] == -1, 'Sequence'] = 0
    df2['Group'] = df2['Sequence'].cumsum()

    # merge results to one dataframe
    df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
    df['Group'] = df['Group'].fillna(0)
    df['Group'] = df['Group'].astype(int)
    df.drop(['Position_Prev', 'Sequence', 'Sequence_Sum'], axis=1, inplace=True)    
    return df

@njit
def grouper(pos):
    res = np.empty(pos.shape)
    num = 1
    started = 0
    for i in range(len(res)):
        current_pos = pos[i]
        if (started == 0) and (current_pos == 0):
            started = 1
            res[i] = num
        elif (started == 1) and (current_pos == 1):
            started = 0
            res[i] = num
            num += 1
        elif (started == 1) and (current_pos in [-1, 0]):
            res[i] = num
        else:
            res[i] = 0
    return res

def jpp(df):
    arr = np.select([df['Position'].eq('START'), df['Position'].eq('END')], [0, 1], -1)
    df['Group'] = grouper(arr).astype(int)
    return df

Solution 2:

I have a solution. You should avoid loops and instead use shifting, slicing and merging.

This is my first prototype (it should be refactored):

# identification of sequences
df['Position_Prev'] = df['Position'].shift(1)
df['Sequence'] = 0
df.loc[(df['Position'] == 'START') & (df['Position_Prev'] != 'START'), 'Sequence'] = 1
df.loc[df['Position'] == 'END', 'Sequence'] = -1
df['Sequence_Sum'] = df['Sequence'].cumsum()
df.loc[df['Sequence'] == -1, 'Sequence_Sum'] = 1

# take only items between START and END and generate Group number
df2 = df[df['Sequence_Sum'] == 1].copy()
df2.loc[df['Sequence'] == -1, 'Sequence'] = 0
df2['Group'] = df2['Sequence'].cumsum()

# merge results to one dataframe
df = df.merge(df2[['Group']], left_index=True, right_index=True, how='left')
df['Group'] = df['Group'].fillna(0)
df['Group'] = df['Group'].astype(int)
df.drop(columns=['Position_Prev', 'Sequence', 'Sequence_Sum'], inplace=True)
df

Result:

   Vehicle_ID Position  Group
0           A    START      1
1           A      MID      1
2           A      MID      1
3           A      END      1
4           A      MID      0
5           A    START      2
6           A    START      2
7           A      MID      2
8           A      MID      2
9           A      END      2
10          A      MID      0
11          A    START      3
12          A    START      3
13          A      MID      3
14          A      MID      3
15          A      END      3
16          A      MID      0
17          A    START      4
