Bonus: Advanced Aggregation
Advanced GroupBy Operations
Section titled “Advanced GroupBy Operations”

Custom Aggregation Functions

Section titled “Custom Aggregation Functions”

Reference:
import pandas as pd
import numpy as np
# Custom aggregation function
def custom_agg(series):
    """Summarize a numeric Series with several statistics at once.

    Parameters
    ----------
    series : pd.Series
        Numeric values to summarize.

    Returns
    -------
    pd.Series
        Named statistics (mean, std, min, max, range, iqr); used with
        GroupBy.agg/apply this produces one row per group.
    """
    return pd.Series({
        'mean': series.mean(),
        'std': series.std(),
        'min': series.min(),
        'max': series.max(),
        # Spread measures derived from the same quantiles/extrema
        'range': series.max() - series.min(),
        'iqr': series.quantile(0.75) - series.quantile(0.25)
    })
# Apply custom function
df.groupby('category').agg(custom_agg)

Lambda Functions in GroupBy

Section titled “Lambda Functions in GroupBy”

Reference:
# Lambda functions for complex operations
df.groupby('category').agg({
    'value': lambda x: x.quantile(0.95),     # 95th percentile
    'other': lambda x: x.nunique(),          # Count unique values
    'score': lambda x: (x > x.mean()).sum()  # Count above mean
})

# Multiple lambda functions
df.groupby('category').agg({
    'value': [
        lambda x: x.mean(),
        lambda x: x.std(),
        lambda x: x.quantile(0.25),
        lambda x: x.quantile(0.75)
    ]
})

GroupBy with Time Windows

Section titled “GroupBy with Time Windows”

Reference:
# Time-based grouping
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)

# Group by time periods
df.groupby(pd.Grouper(freq='M')).sum()   # Monthly
df.groupby(pd.Grouper(freq='Q')).mean()  # Quarterly
df.groupby(pd.Grouper(freq='A')).max()   # Annual

# Custom time windows
df.groupby(pd.Grouper(freq='7D')).agg({
    'value': ['sum', 'mean', 'count']
})

Advanced Pivot Table Operations

Section titled “Advanced Pivot Table Operations”

Multi-Level Pivot Tables

Section titled “Multi-Level Pivot Tables”

Reference:
# Multi-level pivot tables
pivot = pd.pivot_table(df,
                       values=['sales', 'profit'],
                       index=['region', 'product'],
                       columns=['quarter', 'year'],
                       aggfunc={'sales': 'sum', 'profit': 'mean'},
                       fill_value=0,
                       margins=True)

# Flatten multi-level columns
pivot.columns = ['_'.join(col).strip() for col in pivot.columns]

Pivot Table with Custom Functions

Section titled “Pivot Table with Custom Functions”

Reference:
# Custom aggregation in pivot tables
def weighted_average(group):
    """Weighted average of group['value'], weighted by group['weight'].

    Parameters
    ----------
    group : pd.DataFrame
        Must contain 'value' and 'weight' columns.

    Returns
    -------
    float
    """
    return np.average(group['value'], weights=group['weight'])
pivot = pd.pivot_table(df,
                       values='value',
                       index='category',
                       columns='region',
                       aggfunc=weighted_average)

Pivot Table with Missing Data Handling

Section titled “Pivot Table with Missing Data Handling”

Reference:
# Advanced missing data handling
pivot = pd.pivot_table(df,
                       values='value',
                       index='category',
                       columns='region',
                       aggfunc='mean',
                       fill_value=0,    # Fill missing with 0
                       dropna=False,    # Keep missing combinations
                       observed=True)   # Include all categories

# Handle missing data in different ways
pivot_filled = pivot.fillna(method='ffill')  # Forward fill
pivot_interpolated = pivot.interpolate()     # Linear interpolation
pivot_dropped = pivot.dropna()               # Drop missing rows

Hierarchical Grouping and MultiIndex

Section titled “Hierarchical Grouping and MultiIndex”

MultiIndex Operations

Section titled “MultiIndex Operations”

Reference:
# Create MultiIndex
df_multi = df.set_index(['level1', 'level2'])

# Operations on MultiIndex
df_multi.groupby(level=0).sum()       # Group by first level
df_multi.groupby(level=1).mean()      # Group by second level
df_multi.groupby(level=[0, 1]).max()  # Group by both levels

# Swap levels
df_multi.swaplevel(0, 1)

# Sort by index
df_multi.sort_index()

# Access specific levels
df_multi.loc[('A', 'X')]   # Access specific combination
df_multi.xs('A', level=0)  # Cross-section

Advanced MultiIndex Grouping

Section titled “Advanced MultiIndex Grouping”

Reference:
# Complex MultiIndex operations
def hierarchical_analysis(df):
    """Group on three key columns and aggregate each column differently.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'level1', 'level2', 'level3', 'numeric_col',
        'categorical_col', and 'date_col' columns.

    Returns
    -------
    pd.DataFrame
        One row per (level1, level2, level3) group, with the MultiIndex
        columns flattened to underscore-joined names.
    """
    # Group by multiple levels
    grouped = df.groupby(['level1', 'level2', 'level3'])

    # Apply different functions to different columns; the mode lambda
    # guards against an empty mode() (all-NaN group)
    result = grouped.agg({
        'numeric_col': ['mean', 'std', 'count'],
        'categorical_col': lambda x: x.mode().iloc[0] if not x.mode().empty else None,
        'date_col': ['min', 'max']
    })

    # Flatten column names
    result.columns = ['_'.join(col).strip() for col in result.columns]

    return result
Section titled “Performance Optimization”

Memory-Efficient GroupBy

Section titled “Memory-Efficient GroupBy”

Reference:
# Memory-efficient groupby operations
def memory_efficient_groupby(df, group_cols, agg_cols):
    """Sum agg_cols per group while minimizing memory usage.

    NOTE: downcasts dtypes on *df* in place (that is the point of the
    memory optimization); pass df.copy() if the caller still needs the
    original dtypes.

    Parameters
    ----------
    df : pd.DataFrame
    group_cols : list of str
        Columns to group by; object columns are converted to category.
    agg_cols : list of str
        Numeric columns to sum; float64/int64 are downcast to 32-bit.

    Returns
    -------
    pd.DataFrame
        Per-group sums of agg_cols.
    """
    # Use categorical data types for the grouping keys
    for col in group_cols:
        if df[col].dtype == 'object':
            df[col] = df[col].astype('category')

    # Use narrower numeric data types for the aggregated columns
    for col in agg_cols:
        if df[col].dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif df[col].dtype == 'int64':
            df[col] = df[col].astype('int32')

    # Perform groupby
    result = df.groupby(group_cols)[agg_cols].sum()

    return result
Section titled “Chunked Processing”

Reference:
# Process large datasets in chunks
def chunked_groupby(file_path, group_cols, agg_cols, chunk_size=10000):
    """Aggregate a large CSV in chunks to bound peak memory.

    Each chunk is grouped and summed, then the per-chunk partial sums
    are combined with a second groupby over *all* index levels.
    (Grouping only on level=0, as is often shown, silently merges
    distinct keys when group_cols has more than one column.)

    Parameters
    ----------
    file_path : str
        Path to the CSV file.
    group_cols : list of str
        Key columns to group by.
    agg_cols : list of str
        Numeric columns to sum.
    chunk_size : int, default 10000
        Rows per chunk passed to pd.read_csv.

    Returns
    -------
    pd.DataFrame
        Per-group sums over the whole file.
    """
    results = []

    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Partial aggregation of this chunk
        chunk_result = chunk.groupby(group_cols)[agg_cols].sum()
        results.append(chunk_result)

    # Combine partial results; sum() re-aggregates groups split across chunks
    combined = pd.concat(results)
    levels = list(range(combined.index.nlevels))
    final_result = combined.groupby(level=levels).sum()

    return final_result
Section titled “Parallel Processing”

Reference:
from multiprocessing import Pool

import pandas as pd
def process_chunk(chunk_data):
    """Group a single chunk by 'category' and sum its columns."""
    return chunk_data.groupby('category').sum()
def parallel_groupby(df, n_processes=4):
    """Split df row-wise, group-sum each piece in a worker process,
    then re-aggregate the partial results.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a 'category' column (see process_chunk).
    n_processes : int, default 4
        Number of worker processes.

    Returns
    -------
    pd.DataFrame
        Per-category sums, identical to df.groupby('category').sum().
    """
    # Split data into chunks; max(1, ...) guards against a zero-size
    # chunk (range step of 0) when len(df) < n_processes
    chunk_size = max(1, len(df) // n_processes)
    chunks = [df.iloc[i:i + chunk_size] for i in range(0, len(df), chunk_size)]

    # Process in parallel
    with Pool(n_processes) as pool:
        results = pool.map(process_chunk, chunks)

    # Combine results: re-sum groups that were split across chunks
    return pd.concat(results).groupby(level=0).sum()
Section titled “Advanced Statistical Aggregations”

Rolling Statistics

Section titled “Rolling Statistics”

Reference:
# Rolling statistics within groups
# NOTE: GroupBy.rolling/expanding return a (group, original index)
# MultiIndex, so the result must have the group level dropped before it
# aligns with df's index — a bare assignment raises/misaligns otherwise.
df['rolling_mean'] = (
    df.groupby('category')['value']
      .rolling(window=5).mean()
      .reset_index(level=0, drop=True)
)
df['rolling_std'] = (
    df.groupby('category')['value']
      .rolling(window=5).std()
      .reset_index(level=0, drop=True)
)

# Expanding statistics
df['expanding_sum'] = (
    df.groupby('category')['value']
      .expanding().sum()
      .reset_index(level=0, drop=True)
)
df['expanding_mean'] = (
    df.groupby('category')['value']
      .expanding().mean()
      .reset_index(level=0, drop=True)
)
Section titled “Percentile Aggregations”

Reference:
# Custom percentile functions
def percentile_agg(series):
    """Return the 25/50/75/90/95/99th percentiles of a numeric Series.

    Parameters
    ----------
    series : pd.Series

    Returns
    -------
    pd.Series
        Named percentiles; with GroupBy.apply this yields one row per group.
    """
    return pd.Series({
        'p25': series.quantile(0.25),
        'p50': series.quantile(0.50),
        'p75': series.quantile(0.75),
        'p90': series.quantile(0.90),
        'p95': series.quantile(0.95),
        'p99': series.quantile(0.99)
    })
# Apply to groups
df.groupby('category')['value'].apply(percentile_agg)

Statistical Tests in Groups

Section titled “Statistical Tests in Groups”

Reference:
from scipy import stats
def statistical_tests(group):
    """Run a normality test and moment statistics on group['value'].

    Small groups get NaNs for the test statistics: scipy's normaltest
    (via skewtest) requires at least 8 observations, so the guard is
    < 8 rather than < 3. The early return uses the same keys as the
    full result so GroupBy.apply output stays rectangular.

    Parameters
    ----------
    group : pd.DataFrame
        Must contain a numeric 'value' column.

    Returns
    -------
    pd.Series
        normality_stat, normality_p, mean, std, skewness, kurtosis.
    """
    if len(group) < 8:
        return pd.Series({
            'normality_stat': np.nan,
            'normality_p': np.nan,
            'mean': group['value'].mean(),
            'std': group['value'].std(),
            'skewness': np.nan,
            'kurtosis': np.nan
        })

    # Normality test (D'Agostino-Pearson)
    stat, p_value = stats.normaltest(group['value'])

    return pd.Series({
        'normality_stat': stat,
        'normality_p': p_value,
        'mean': group['value'].mean(),
        'std': group['value'].std(),
        'skewness': stats.skew(group['value']),
        'kurtosis': stats.kurtosis(group['value'])
    })
# Apply to groups
df.groupby('category').apply(statistical_tests)

Advanced Pivot Table Features

Section titled “Advanced Pivot Table Features”

Pivot Table with Custom Index

Section titled “Pivot Table with Custom Index”

Reference:
# Custom index in pivot tables
pivot = pd.pivot_table(df,
                       values='value',
                       index=pd.cut(df['numeric_col'], bins=5),
                       columns='category',
                       aggfunc='mean')

# Multi-level index
pivot = pd.pivot_table(df,
                       values='value',
                       index=['level1', 'level2'],
                       columns='category',
                       aggfunc='sum')

Pivot Table with Time Index

Section titled “Pivot Table with Time Index”

Reference:

# Time-based pivot tables
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year

pivot = pd.pivot_table(df,
                       values='value',
                       index=['year', 'month'],
                       columns='category',
                       aggfunc='sum',
                       fill_value=0)

Pivot Table with Custom Aggregation

Section titled “Pivot Table with Custom Aggregation”

Reference:
# Custom aggregation in pivot tables
def weighted_mean(group):
    """Weighted mean of group['value'], weighted by group['weight'].

    Parameters
    ----------
    group : pd.DataFrame
        Must contain 'value' and 'weight' columns.

    Returns
    -------
    float
    """
    return np.average(group['value'], weights=group['weight'])
pivot = pd.pivot_table(df,
                       values='value',
                       index='category',
                       columns='region',
                       aggfunc=weighted_mean,
                       fill_value=0)

Advanced GroupBy Transformations

Section titled “Advanced GroupBy Transformations”

Ranking Within Groups

Section titled “Ranking Within Groups”

Reference:
# Ranking within groups
df['rank'] = df.groupby('category')['value'].rank(ascending=False)
df['percentile'] = df.groupby('category')['value'].rank(pct=True)

# Multiple ranking methods
df['rank_dense'] = df.groupby('category')['value'].rank(method='dense')
df['rank_min'] = df.groupby('category')['value'].rank(method='min')
df['rank_max'] = df.groupby('category')['value'].rank(method='max')

Lag and Lead Operations

Section titled “Lag and Lead Operations”

Reference:
# Lag and lead operations within groups
df['value_lag1'] = df.groupby('category')['value'].shift(1)
df['value_lag2'] = df.groupby('category')['value'].shift(2)
df['value_lead1'] = df.groupby('category')['value'].shift(-1)

# Difference from previous value
df['value_diff'] = df.groupby('category')['value'].diff()

# Percentage change
df['value_pct_change'] = df.groupby('category')['value'].pct_change()

Window Functions

Section titled “Window Functions”

Reference:
# Window functions within groups
# NOTE: GroupBy.rolling/expanding return a (group, original index)
# MultiIndex; drop the group level so the values align with df's index
# before assigning them to a column.
df['rolling_mean'] = (
    df.groupby('category')['value'].rolling(window=3).mean()
      .reset_index(level=0, drop=True)
)
df['rolling_std'] = (
    df.groupby('category')['value'].rolling(window=3).std()
      .reset_index(level=0, drop=True)
)
df['expanding_sum'] = (
    df.groupby('category')['value'].expanding().sum()
      .reset_index(level=0, drop=True)
)
df['expanding_mean'] = (
    df.groupby('category')['value'].expanding().mean()
      .reset_index(level=0, drop=True)
)
Section titled “Custom GroupBy Classes”

Custom GroupBy Aggregator

Section titled “Custom GroupBy Aggregator”

Reference:
class CustomGroupBy:
    """Thin convenience wrapper around DataFrame.groupby.

    Holds the source frame, the grouping columns, and the (lazy)
    GroupBy object, and exposes common aggregation patterns as methods.
    """

    def __init__(self, df, group_cols):
        self.df = df
        self.group_cols = group_cols
        self.grouped = df.groupby(group_cols)

    def custom_agg(self, agg_col, func):
        """Apply a custom aggregation function to one column."""
        return self.grouped[agg_col].apply(func)

    def multiple_aggs(self, agg_dict):
        """Apply multiple aggregations, e.g. {'col': ['sum', 'mean']}."""
        return self.grouped.agg(agg_dict)

    def filter_groups(self, condition):
        """Keep rows of groups for which condition(group) is truthy."""
        return self.grouped.filter(condition)

    def transform_groups(self, func):
        """Broadcast func over each group, preserving df's shape."""
        return self.grouped.transform(func)
# Usage
custom_gb = CustomGroupBy(df, ['category'])
result = custom_gb.custom_agg('value', lambda x: x.quantile(0.95))

Advanced Remote Computing

Section titled “Advanced Remote Computing”

Distributed Computing

Section titled “Distributed Computing”

Reference:
# Distributed computing with Dask
import dask.dataframe as dd

# Read large dataset with Dask
df = dd.read_csv('large_dataset.csv')

# Perform groupby operations
result = df.groupby('category').agg({
    'value': ['sum', 'mean', 'count']
}).compute()

# Save results
result.to_csv('distributed_results.csv')

Cloud Computing

Section titled “Cloud Computing”

Reference:
# Cloud computing with AWS/GCP
import boto3
import pandas as pd

# Read from S3
s3 = boto3.client('s3')
df = pd.read_csv('s3://bucket/data.csv')

# Process data
result = df.groupby('category').sum()

# Save back to S3
result.to_csv('s3://bucket/results.csv')

These advanced topics will help you handle complex aggregation scenarios and optimize performance for large datasets in your data science work.