Master Pandas fast! This Pandas Cheatsheet for Beginners offers quick references and clear examples for data analysis, manipulation, and more. Get started now!
Pandas Basics
Pandas is a Python library for data manipulation and analysis. It introduces two primary data structures: Series (1D labeled array) and DataFrame (2D labeled table).
import pandas as pd
import numpy as np # Often used with Pandas
# Creating a Series
s = pd.Series([1, 2, 3, 4, 5])
print(s)
# > 0 1
# > 1 2
# > 2 3
# > 3 4
# > 4 5
# > dtype: int64
s_labeled = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
print(s_labeled)
# > a 10
# > b 20
# > c 30
# > dtype: int64
# Creating a DataFrame from a dictionary
data = {
'Name': ['Alice', 'Bob', 'Charlie', 'David'],
'Age': [25, 30, 35, 40],
'City': ['New York', 'London', 'Paris', 'New York']
}
df = pd.DataFrame(data)
print(df)
# > Name Age City
# > 0 Alice 25 New York
# > 1 Bob 30 London
# > 2 Charlie 35 Paris
# > 3 David 40 New York
# Creating a DataFrame from a list of lists (with columns)
data_list = [['John', 28, 'Berlin'], ['Anna', 22, 'Rome']]
df_list = pd.DataFrame(data_list, columns=['Name', 'Age', 'City'])
print(df_list)
# > Name Age City
# > 0 John 28 Berlin
# > 1 Anna 22 Rome
# Basic DataFrame Info
print(df.head(2)) # Displays first N rows > First 2 rows of df
print(df.tail(1)) # Displays last N rows > Last row of df
print(df.info()) # Summary of DataFrame, including dtypes and non-null values
print(df.describe()) # Statistical summary of numerical columns
print(df.shape) # Returns (rows, columns) tuple > (4, 3)
print(df.columns) # Returns column labels > Index(['Name', 'Age', 'City'], dtype='object')
print(df.index) # Returns row labels > RangeIndex(start=0, stop=4, step=1)
Data Selection and Indexing
Accessing specific rows, columns, or cells in a DataFrame.
# Select a single column (returns a Series)
print(df['Name'])
# > 0 Alice
# > 1 Bob
# > 2 Charlie
# > 3 David
# > Name: Name, dtype: object
# Select multiple columns (returns a DataFrame)
print(df[['Name', 'Age']])
# > Name Age
# > 0 Alice 25
# > 1 Bob 30
# > 2 Charlie 35
# > 3 David 40
# Select rows by index (integer-location based indexing - iloc)
print(df.iloc[0]) # First row (Series) > Name: Alice, Age: 25, City: New York
print(df.iloc[1:3]) # Rows from index 1 up to (but not including) 3
# > Name Age City
# > 1 Bob 30 London
# > 2 Charlie 35 Paris
# Select rows by label (label-location based indexing - loc)
df_indexed = df.set_index('Name') # Set 'Name' as index for loc examples
print(df_indexed.loc['Alice']) # Row(s) with label 'Alice' (Series)
# > Age 25
# > City New York
# > Name: Alice, dtype: object
print(df_indexed.loc[['Alice', 'David']]) # Multiple labeled rows
# > Age City
# > Name
# > Alice 25 New York
# > David 40 New York
# Select specific cell (row by label, column by label)
print(df_indexed.loc['Alice', 'Age']) # > 25
# Select specific cell (row by int index, column by int index)
print(df.iloc[0, 1]) # > 25
# Boolean indexing (filtering)
print(df[df['Age'] > 30]) # Rows where Age is greater than 30
# > Name Age City
# > 2 Charlie 35 Paris
# > 3 David 40 New York
print(df[(df['Age'] > 30) & (df['City'] == 'New York')]) # Multiple conditions
# > Name Age City
# > 3 David 40 New York
print(df[df['City'].isin(['London', 'Paris'])]) # Using isin()
# > Name Age City
# > 1 Bob 30 London
# > 2 Charlie 35 Paris
Data Manipulation
Modifying, adding, or transforming data within a DataFrame.
# Adding a new column
df['Country'] = 'USA'
print(df)
# > Name Age City Country
# > 0 Alice 25 New York USA
# > 1 Bob 30 London USA
# > 2 Charlie 35 Paris USA
# > 3 David 40 New York USA
# Adding a new column based on existing ones
df['Age_Group'] = np.where(df['Age'] >= 30, 'Adult', 'Young')
print(df)
# > Name Age City Country Age_Group
# > 0 Alice 25 New York USA Young
# > 1 Bob 30 London USA Adult
# > 2 Charlie 35 Paris USA Adult
# > 3 David 40 New York USA Adult
# Renaming columns
df_renamed = df.rename(columns={'Name': 'Full_Name', 'City': 'Location'})
print(df_renamed)
# > Full_Name Age Location Country Age_Group
# > 0 Alice 25 New York USA Young
# > 1 Bob 30 London USA Adult
# > 2 Charlie 35 Paris USA Adult
# > 3 David 40 New York USA Adult
# Dropping columns
df_no_country = df.drop('Country', axis=1) # axis=1 for columns
print(df_no_country)
# > Name Age City Age_Group
# > 0 Alice 25 New York Young
# > 1 Bob 30 London Adult
# > 2 Charlie 35 Paris Adult
# > 3 David 40 New York Adult
# Dropping rows (by index)
df_no_first_row = df.drop(0, axis=0) # axis=0 for rows
print(df_no_first_row)
# > Name Age City Country Age_Group
# > 1 Bob 30 London USA Adult
# > 2 Charlie 35 Paris USA Adult
# > 3 David 40 New York USA Adult
# Handling Missing Data (NaN - Not a Number)
data_missing = {'A': [1, 2, np.nan], 'B': [4, np.nan, 6]}
df_missing = pd.DataFrame(data_missing)
print(df_missing)
# > A B
# > 0 1.0 4.0
# > 1 2.0 NaN
# > 2 NaN 6.0
print(df_missing.dropna()) # Drops rows with any NaN values
# > A B
# > 0 1.0 4.0
print(df_missing.fillna(0)) # Fills NaN values with 0
# > A B
# > 0 1.0 4.0
# > 1 2.0 0.0
# > 2 0.0 6.0
print(df_missing['A'].fillna(df_missing['A'].mean())) # Fill with column mean
# > 0 1.0
# > 1 2.0
# > 2 1.5
# > Name: A, dtype: float64
Aggregation and Grouping
Summarizing data and performing operations on groups.
# Create a new DataFrame for aggregation examples
agg_data = {
'Category': ['A', 'B', 'A', 'B', 'A'],
'Value': [10, 20, 15, 25, 12],
'Count': [1, 1, 2, 1, 3]
}
df_agg = pd.DataFrame(agg_data)
print(df_agg)
# > Category Value Count
# > 0 A 10 1
# > 1 B 20 1
# > 2 A 15 2
# > 3 B 25 1
# > 4 A 12 3
# Basic aggregations
print(df_agg['Value'].sum()) # Sum of a column > 82
print(df_agg['Value'].mean()) # Mean of a column > 16.4
print(df_agg['Value'].min()) # Minimum value > 10
print(df_agg['Value'].max()) # Maximum value > 25
print(df_agg['Value'].count()) # Number of non-null values > 5
print(df_agg['Value'].median()) # Median value > 15.0
# Grouping data (groupby)
grouped_by_category = df_agg.groupby('Category')
print(grouped_by_category['Value'].sum()) # Sum of Value for each Category
# > Category
# > A 37
# > B 45
# > Name: Value, dtype: int64
print(df_agg.groupby('Category')['Value'].mean()) # Mean of Value for each Category
# > Category
# > A 12.333333
# > B 22.500000
# > Name: Value, dtype: float64
# Grouping with multiple aggregations (agg)
print(df_agg.groupby('Category').agg(
Total_Value=('Value', 'sum'),
Average_Value=('Value', 'mean'),
Max_Count=('Count', 'max')
))
# > Total_Value Average_Value Max_Count
# > Category
# > A 37 12.333333 3
# > B 45 22.500000 1
Combining DataFrames
Merging, joining, and concatenating DataFrames.
# Create two DataFrames for combining examples
df1 = pd.DataFrame({'key': ['A', 'B', 'C', 'D'], 'value': [1, 2, 3, 4]})
df2 = pd.DataFrame({'key': ['B', 'D', 'E', 'F'], 'value_2': [5, 6, 7, 8]})
df3 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df4 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]})
# Concatenation (stacking DataFrames)
print(pd.concat([df3, df4])) # Concatenates rows by default (axis=0)
# > A B
# > 0 1 3
# > 1 2 4
# > 0 5 7
# > 1 6 8
print(pd.concat([df3, df4], axis=1)) # Concatenates columns
# > A B A B
# > 0 1 3 5 7
# > 1 2 4 6 8
# Merging (SQL-style joins)
print(pd.merge(df1, df2, on='key', how='inner')) # Only common keys
# > key value value_2
# > 0 B 2 5
# > 1 D 4 6
print(pd.merge(df1, df2, on='key', how='left')) # All keys from left, matching from right
# > key value value_2
# > 0 A 1 NaN
# > 1 B 2 5.0
# > 2 C 3 NaN
# > 3 D 4 6.0
print(pd.merge(df1, df2, on='key', how='right')) # All keys from right, matching from left
# > key value value_2
# > 0 B 2.0 5
# > 1 D 4.0 6
# > 2 E NaN 7
# > 3 F NaN 8
print(pd.merge(df1, df2, on='key', how='outer')) # All keys from both
# > key value value_2
# > 0 A 1.0 NaN
# > 1 B 2.0 5.0
# > 2 C 3.0 NaN
# > 3 D 4.0 6.0
# > 4 E NaN 7.0
# > 5 F NaN 8.0
Input/Output
Reading data from and writing data to various file formats.
# Create a dummy DataFrame to save
data_to_save = {'Col1': [10, 20, 30], 'Col2': ['A', 'B', 'C']}
df_io = pd.DataFrame(data_to_save)
# Saving to CSV
# df_io.to_csv('my_data.csv', index=False) # Saves DataFrame to 'my_data.csv'
# # 'index=False' prevents writing the DataFrame index as a column
# Reading from CSV
# df_read_csv = pd.read_csv('my_data.csv')
# print(df_read_csv)
# # > Col1 Col2
# # > 0 10 A
# # > 1 20 B
# # > 2 30 C
# Saving to Excel (requires openpyxl or xlwt/xlsxwriter)
# df_io.to_excel('my_data.xlsx', index=False, sheet_name='Sheet1')
# Reading from Excel
# df_read_excel = pd.read_excel('my_data.xlsx', sheet_name='Sheet1')
# print(df_read_excel)
# # > Col1 Col2
# # > 0 10 A
# # > 1 20 B
# # > 2 30 C
# Saving to JSON
# df_io.to_json('my_data.json', orient='records') # orient controls JSON format
# Reading from JSON
# df_read_json = pd.read_json('my_data.json', orient='records')
# print(df_read_json)
# # > Col1 Col2
# # > 0 10 A
# # > 1 20 B
# # > 2 30 C
This Pandas cheatsheet covers the most frequently used functions and concepts. As you become more familiar with Pandas, you’ll discover many more specialized operations, but this should be an excellent starting point for beginners!













