=================================================================================
Table 3451a. DataFrame Cheatsheet.
Table 3451b. Cheatsheet of PySpark (for DataFrames).
Function |
Code |
DataFrame Operations |
Show DataFrame |
df.show() |
Show DataFrame with truncated columns |
df.show(truncate=False) |
Show DataFrame with limited rows |
df.show(n=10) |
Print DataFrame schema |
df.printSchema() |
Select columns |
df.select("column1", "column2") |
Select columns with aliases |
df.select(col("column1").alias("col1"), col("column2").alias("col2")) |
Filter rows |
df.filter(col("age") > 18) |
Filter rows with multiple conditions |
df.filter((col("age") > 18) & (col("gender") == "M")) |
Filter rows with SQL expression |
df.filter("age > 18 AND gender = 'M'") |
Filter rows with NULL values |
df.filter(col("column").isNull()) |
Filter rows with NOT NULL values |
df.filter(col("column").isNotNull()) |
Filter rows with IN clause |
df.filter(col("column").isin(1, 2, 3)) |
Filter rows with LIKE clause |
df.filter(col("name").like("J%")) |
Filter rows with RLIKE clause |
df.filter(col("name").rlike("J.*")) |
Filter rows with BETWEEN clause |
df.filter(col("age").between(18, 30)) |
Distinct values |
df.distinct() |
Distinct values of specific columns |
df.dropDuplicates(["column1", "column2"]) |
Sort by column |
df.sort("column") |
Sort by multiple columns |
df.sort("column1", "column2") |
Sort by column in descending order |
df.sort(col("column").desc()) |
Group by column |
df.groupBy("column") |
Group by multiple columns |
df.groupBy("column1", "column2") |
Aggregations (count, sum, avg, min, max) |
df.groupBy("column").agg(count("*").alias("count"), sum("value").alias("sum"), avg("value").alias("avg"), min("value").alias("min"), max("value").alias("max")) |
Count missing (NaN) values in each column (note: df.isna().sum() is pandas syntax, not PySpark) |
df.isna().sum() |
Pivot table |
df.groupBy("column1").pivot("column2").agg(count("*")) |
Unpivot table |
df.select("column1", expr("stack(3, 'column2', column2, 'column3', column3, 'column4', column4) as (key, value)")).where("value is not null") |
Window functions (rank, dense_rank, percent_rank, row_number) |
from pyspark.sql.window import Window; window = Window.partitionBy("column1").orderBy("column2"); df.withColumn("rank", rank().over(window)) |
Lag and lead functions |
from pyspark.sql.window import Window; window = Window.partitionBy("column1").orderBy("column2"); df.withColumn("lag", lag("value", 1).over(window)).withColumn("lead", lead("value", 1).over(window)) |
Cumulative sum |
from pyspark.sql.window import Window; window = Window.partitionBy("column1").orderBy("column2"); df.withColumn("cumulative_sum", sum("value").over(window)) |
Cumulative max |
from pyspark.sql.window import Window; window = Window.partitionBy("column1").orderBy("column2"); df.withColumn("cumulative_max", max("value").over(window)) |
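Window functions are the least copy-paste-friendly rows in this table, so here is one self-contained sketch combining rank, lag/lead, and a cumulative sum (data and column names are made up):

from pyspark.sql import SparkSession
from pyspark.sql.functions import rank, lag, lead, sum as spark_sum
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [("a", 1, 10), ("a", 2, 20), ("a", 3, 30), ("b", 1, 5)],
    ["column1", "column2", "value"],
)

window = Window.partitionBy("column1").orderBy("column2")
# For running totals, spell out the frame from the partition start to the current row
cumulative = window.rowsBetween(Window.unboundedPreceding, Window.currentRow)

(df.withColumn("rank", rank().over(window))
   .withColumn("lag", lag("value", 1).over(window))
   .withColumn("lead", lead("value", 1).over(window))
   .withColumn("cumulative_sum", spark_sum("value").over(cumulative))
   .show())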
DataFrame Joins |
Inner join |
df1.join(df2, on="key", how="inner") |
Left outer join |
df1.join(df2, on="key", how="left") |
Right outer join |
df1.join(df2, on="key", how="right") |
Full outer join |
df1.join(df2, on="key", how="full") |
Left semi join |
df1.join(df2, on="key", how="leftsemi") |
Left anti join |
df1.join(df2, on="key", how="leftanti") |
Cross join |
df1.crossJoin(df2) |
Self join |
df.alias("t1").join(df.alias("t2"), on="key") |
Join with complex condition |
df1.join(df2, (df1.column1 == df2.column2) & (df1.column3 > df2.column4)) |
Join with multiple keys |
df1.join(df2, on=["key1", "key2"], how="inner") |
DataFrame Set Operations
|
Union |
df1.union(df2)
|
Union by name |
df1.unionByName(df2)
|
Intersect |
df1.intersect(df2)
|
Except (except is a reserved word in Python; PySpark provides exceptAll, which keeps duplicates) |
df1.exceptAll(df2)
|
Subtract |
df1.subtract(df2)
|
DataFrame Sorting
|
Sort by column |
df.sort("column")
|
Sort by multiple columns |
df.sort("column1", "column2") |
Sort by column in ascending order |
df.sort(col("column").asc()) |
Sort by column in descending order |
df.sort(col("column").desc()) |
DataFrame Grouping and Aggregation |
Group by column |
df.groupBy("column") |
Group by multiple columns |
df.groupBy("column1", "column2") |
Aggregations (count, sum, avg, min, max) |
df.groupBy("column").agg(count("*").alias("count"), sum("value").alias("sum"), avg("value").alias("avg"), min("value").alias("min"), max("value").alias("max")) |
Aggregation with filter |
df.groupBy("column").agg(sum(when(col("value") > 100, col("value"))).alias("sum_filtered")) |
Aggregation with multiple filters |
df.groupBy("column").agg(sum(when(col("value") > 100, col("value"))).alias("sum_filtered1"), sum(when(col("value") < 50, col("value"))).alias("sum_filtered2")) |
Pivot table |
df.groupBy("column1").pivot("column2").agg(count("*")) |
Unpivot table |
df.select("column1", expr("stack(3, 'column2', column2, 'column3', column3, 'column4', column4) as (key, value)")).where("value is not null") |
DataFrame Explode and Flatten |
Explode array column |
df.select(explode("array_column")) |
Explode map column |
df.select(explode("map_column")) |
Flatten struct column |
df.select("*", col("struct_column.*"))
|
Flatten nested struct column |
df.select("*", col("nested_struct_column.level1.*"), col("nested_struct_column.level2.*"))
|
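A quick sketch of explode on array and map columns (toy data; explode yields one output row per array element, and key/value columns for a map):

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [(1, ["a", "b"], {"x": 10}), (2, ["c"], {"y": 20})],
    ["id", "array_column", "map_column"],
)

df.select("id", explode("array_column")).show()  # one row per element
df.select("id", explode("map_column")).show()    # key and value columns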
DataFrame Array Functions
|
Array contains |
df.filter(array_contains(col("array_column"), "value")) |
Array distinct |
df.select(array_distinct(col("array_column"))) |
Array except |
df.select(array_except(col("array_column1"), col("array_column2"))) |
Array intersect |
df.select(array_intersect(col("array_column1"), col("array_column2"))) |
Array join |
df.select(array_join(col("array_column"), ",")) |
Array max |
df.select(array_max(col("array_column"))) |
Array min |
df.select(array_min(col("array_column"))) |
Array position |
df.select(array_position(col("array_column"), "value")) |
Array remove |
df.select(array_remove(col("array_column"), "value"))
|
Array repeat |
df.select(array_repeat("value", 3))
|
Array size |
df.select(size(col("array_column")))
|
Array sort |
df.select(array_sort(col("array_column")))
|
Array union |
df.select(array_union(col("array_column1"), col("array_column2")))
|
Array zip |
df.select(arrays_zip(col("array_column1"), col("array_column2")))
|
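A few of the array functions above, run against a toy column (hypothetical data):

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_contains, array_distinct, array_join, size, col

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([(["a", "b", "b"],), (["c"],)], ["array_column"])

df.filter(array_contains(col("array_column"), "a")).show()        # keeps the first row
df.select(array_distinct(col("array_column")).alias("d")).show()  # ["a", "b"] and ["c"]
df.select(array_join(col("array_column"), ",").alias("j")).show() # "a,b,b" and "c"
df.select(size(col("array_column")).alias("n")).show()            # 3 and 1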
DataFrame Map Functions |
Map contains key |
df.filter(col("map_column").getItem("key").isNotNull())
|
Map keys |
df.select(map_keys(col("map_column")))
|
Map values |
df.select(map_values(col("map_column")))
|
Map from entries |
df.select(map_from_entries(col("array_column")))
|
Map concat |
df.select(map_concat(col("map_column1"), col("map_column2")))
|
Map zip with |
df.select(map_zip_with(col("map_column1"), col("map_column2"), lambda k, v1, v2: v1 + v2))
|
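map_zip_with (available in the PySpark functions API from Spark 3.1) merges two maps by key with a Python lambda. A small sketch with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import map_zip_with

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame(
    [({"a": 1, "b": 2}, {"a": 10, "b": 20})],
    ["map_column1", "map_column2"],
)

# The lambda receives the key and the value from each map
df.select(
    map_zip_with("map_column1", "map_column2", lambda k, v1, v2: v1 + v2).alias("summed")
).show(truncate=False)  # {a -> 11, b -> 22}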
DataFrame Date and Timestamp Functions
|
Current date |
df.select(current_date()) |
Current timestamp |
df.select(current_timestamp()) |
Date add |
df.select(date_add(col("date_column"), 7)) |
Date format |
df.select(date_format(col("date_column"), "yyyy-MM-dd")) |
Date sub |
df.select(date_sub(col("date_column"), 7)) |
Date diff |
df.select(datediff(col("end_date"), col("start_date"))) |
To date |
df.select(to_date(col("timestamp_column"))) |
To timestamp |
df.select(to_timestamp(col("string column"), "yyyy-MM-dd HH:mm:ss")) |
Trunc |
df.select(trunc(col("timestamp_column"), "year")) |
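The date functions combine naturally; a runnable sketch with hypothetical dates:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date, date_add, datediff, to_date

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("2024-01-01", "2024-02-15")], ["start", "end"])
df = df.select(to_date(col("start")).alias("start_date"),
               to_date(col("end")).alias("end_date"))

df.select(
    current_date().alias("today"),
    date_add(col("start_date"), 7).alias("plus_week"),          # 2024-01-08
    datediff(col("end_date"), col("start_date")).alias("days"), # 45
).show()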
DataFrame Miscellaneous Functions |
Coalesce |
df.select(coalesce(col("column1"), col("column2"), lit("default_value"))) |
When otherwise |
df.select(when(col("column") > 10, "GT10").when(col("column") < 5, "LT5").otherwise("BETWEEN")) |
Case when |
df.select(expr("CASE WHEN column1 > 10 THEN 'GT10' WHEN column1 < 5 THEN 'LT5' ELSE 'BETWEEN' END")) |
Concat |
df.select(concat(col("column1"), lit("_"), col("column2"))) |
Concat with separator |
df.select(concat_ws("_", col("column1"), col("column2"), col("column3"))) |
Substring |
df.select(substring(col("column"), 1, 5)) |
Substring index |
df.select(substring_index(col("column"), ".", 1)) |
Instr |
df.select(instr(col("column"), "substring")) |
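A sketch combining when/otherwise, concat_ws, and substring on toy data (the cast to string keeps concat_ws's input types explicit):

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, concat_ws, substring

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("alpha", 12), ("beta", 3)], ["name", "value"])

df.select(
    concat_ws("_", col("name"), col("value").cast("string")).alias("joined"),
    substring(col("name"), 1, 3).alias("first3"),
    when(col("value") > 10, "GT10")
        .when(col("value") < 5, "LT5")
        .otherwise("BETWEEN").alias("bucket"),
).show()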
Plot from CSV File/DataFrame
|
page4211, page4735, page4278, page4500. |
Table 3451c. Cheatsheet of Pandas.
Code |
Output |
Explanation |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
print(df)
print()
df_1 = df[df["Column1"]>1]
print(df_1) |
|
Select the rows under a condition (subset of dataframe) |
# Headers to extract
headers_to_extract = ['A', 'C']
# Extract subset
subset_df = df[headers_to_extract] |
|
Extracts specified columns (subset) from an existing DataFrame |
pd.read_csv(filename) |
|
Import csv file |
pd.read_table(filename) |
|
Import a delimited text file (tab-separated by default) |
pd.read_excel(filename) |
|
Import an Excel file |
pd.read_sql(query, connection_object) |
|
Read from a SQL database using a query and a connection object |
pd.read_json(json_string) |
|
Read from a JSON string |
df.to_csv("xyz.csv") |
|
Save the dataframe to csv file |
df = pd.read_csv(input_file, nrows = 100) |
|
First 100 rows to form a dataframe |
last_column = df.columns[-1] |
|
Identify the column name of the last column |
df.columns[1:-1] |
|
The last column and all other columns (except the first column) |
df.to_excel("xyz.xlsx") |
|
Save the DataFrame to an Excel file (note the .xlsx extension) |
df.to_sql(table_name, connection_object) |
|
Write the DataFrame to a SQL table |
df.to_json(filename) |
|
Write the DataFrame to a JSON file |
my_string = "Yougui Liao"
print(my_string.find('L')) |
7 |
Returns the index of the first instance of the string inside the subject string, otherwise -1 |
my_string = "Yougui Liao"
print(my_string.replace('Y', 'C')) |
Cougui Liao |
Replaces any instance of the first string with the second in my_string |
my_dictionary = {'Yougui': 10, 12: 'laptop', (0,0):'center'}
print(my_dictionary["Yougui"]) |
10 |
Access value using key |
my_dictionary = {'Yougui': 10, 12: 'laptop', (0,0):'center'}
print(my_dictionary.keys()) |
dict_keys(['Yougui', 12, (0, 0)]) |
Get all keys in a dictionary (returned as a dict_keys view; wrap in list() for a list) |
my_dictionary = {'Yougui': 10, 12: 'laptop', (0,0):'center'}
print(my_dictionary.values()) |
dict_values([10, 'laptop', 'center']) |
Get all values in a dictionary (returned as a dict_values view; wrap in list() for a list) |
my_list = ["Yougui", "Liao"]
print(my_list.extend(["Globalsino", "com"]))
print(my_list) |
None
['Yougui', 'Liao', 'Globalsino', 'com'] |
Add multiple items to a list |
my_list = ["Yougui", "Liao"]
print(my_list.append(["Globalsino", "com"]))
print(my_list)
|
None
['Yougui', 'Liao', ['Globalsino', 'com']] |
Add a single item to a list |
my_list = ["Yougui", "Liao"]
del(my_list[1])
print(my_list)
|
['Yougui']
|
Delete the object of a list at a specified index |
my_list = ["Yougui", "Liao"]
new_list = my_list[:]
print(new_list) |
['Yougui', 'Liao'] |
Clone a list |
my_list = [1, 2]
print(sum(my_list)) |
3 |
Calculate the sum of a list of ints or floats |
a.add(4) |
|
Add an item to the set |
a.remove("Bye") |
|
Remove an item from a set |
a.difference(b) |
|
Returns set a minus b |
a.intersection(b) |
|
Returns intersection of set a and b |
a.union(b) |
|
Returns the union of set a and b |
a.issubset(b) |
|
Returns True if a is a subset of b, false otherwise |
a.issuperset(b) |
|
Returns True if a is a superset of b, false otherwise |
my_string[start:stop]
my_collection[start:stop]
my_tup[start:stop] |
|
Accessing a subset of data from a string, list, or tuple using element numbers from start to stop -1 |
try:
    # Code to try to execute
except ExceptionTypeA:
    # Code to execute if there is an error of type ExceptionTypeA
except ExceptionTypeB:
    # Code to execute if there is an error of type ExceptionTypeB
except Exception:
    # Code to execute for any exception not handled above
else:
    # Code to execute if there is no exception |
|
Try/Except |
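A concrete, runnable instance of the pattern (the values are arbitrary):

try:
    result = 10 / int("0")
except ZeroDivisionError:
    print("Cannot divide by zero")
except ValueError:
    print("Not a valid integer")
except Exception as e:
    print(f"Unhandled error: {e}")
else:
    print(result)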
soup = BeautifulSoup(html, 'html5lib') |
|
Parse HTML stored as a string |
soup.prettify() |
|
Returns formatted html |
soup.find(tag) |
|
Find the first instance of an HTML tag |
soup.find_all(tag) |
|
Find all instances of an HTML tag |
file.read() |
|
Reads the contents of a file |
file.write(content) |
|
Writes content to a file; open the file in append mode (open(filename, 'a')) to add content at the end. Python file objects have no append() method. |
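A minimal sketch of the append-then-read round trip (the file name is hypothetical):

# 'a' opens the file in append mode, creating it if it does not exist
with open("notes.txt", "a") as f:
    f.write("new line of content\n")

with open("notes.txt", "r") as f:
    print(f.read())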
# Dictionary to rename headers
rename_dict = {
'old_header1': 'new_header1',
'old_header2': 'new_header2',
'old_header3': 'new_header3'
}
# Rename the headers
df.rename(columns=rename_dict, inplace=True) |
|
Change the headers (column names) of a DataFrame |
df["ConstantVar"].value_counts() |
|
Count the occurrences of each unique value in the column named "ConstantVar" of the DataFrame. |
import pandas as pd
# Create a Series from a list
data = [10, 20, 30, 40, 50]
s = pd.Series(data)
print(s) |
|
Create a Series from a list. A Series is a one-dimensional labeled array in Pandas. It can be thought of as a single column of data with labels or indices for each element. You can create a Series from various data sources, such as lists, NumPy arrays, or dictionaries. |
import pandas as pd
# Create a Series from a list
data = [10, 20, 30, 40, 50]
s = pd.Series(data, index=["a", "b", "c", "d", "e"])
print(s) |
|
Create a Series from a list with a custom index. Code. |
# Extract the last 100 values from the "Close" column;
# .reset_index(drop=True) drops the old index and creates a new default
# integer index starting from 0:
current_col_data = df["Close"].iloc[-100:].reset_index(drop=True)
# Assign the Extracted Data to the New Column "Current":
new_df["Current"] = current_col_data |
|
Appends a new column named "Current" to an existing DataFrame new_df, containing the last 100 values from the "Close" column of another DataFrame df. |
data = {
'A': [1, 2, 3],
'B': [4.0, 5.5, 6.1],
'C': ['foo', 'bar', 'baz']
}
df = pd.DataFrame(data)
# Display the data types of each column
print(df.dtypes) |
A int64
B float64
C object
dtype: object
|
An attribute that returns a Series containing the data types of each column in a DataFrame |
pd.DataFrame(np.random.rand(4,3)) |
|
Create test/fake data: 3 columns and 4 rows of random floats (requires import numpy as np) |
df.plot.hist() |
|
Histogram |
df.plot.scatter(x="Column1", y="Column2") |
|
Scatter plot |
unique_ages = df['Age'].unique() |
|
Finding Unique Elements: Use the unique method to determine the unique elements in a column of a DataFrame. |
data = {
'A': [1, 2, 3],
'B': [4.0, 5.5, 6.1],
'C': ['foo', 'bar', 'baz'],
'D': [True, False, True]
}
df = pd.DataFrame(data)
# Display the unique data types of the DataFrame
print(df.dtypes.unique())
|
[dtype('int64') dtype('float64') dtype('O') dtype('bool')]
|
df.dtypes.unique() returns an array of the unique data types present in the DataFrame.
int64: 64-bit integers.
float64: 64-bit floating-point numbers.
O (object): columns with mixed types or strings.
bool: boolean values. |
num = ["int64", "float64"]
num_vars = list(df.select_dtypes(include=num))
print(num_vars)
df_liao = df[num_vars]
print(df_liao) |
|
num = ["int64", "float64"]: This line defines a list num that contains the data types int64 and float64. These represent 64-bit integers and 64-bit floating-point numbers, respectively.
num_vars = list(df.select_dtypes(include=num)):
df.select_dtypes(include=num) selects columns in the DataFrame df that have data types specified in the num list. This will include columns with data types int64 and float64.
list(...) converts the resulting selection of column names into a list.
num_vars will now be a list containing the names of columns in df that are either of type int64 or float64.
df_liao = df[num_vars]:
This creates a new DataFrame df_liao that contains only the columns from df whose names are in the num_vars list.
df_liao will be a subset of df with only the numeric columns. |
df.head(n) |
|
Look at the first n rows |
df.tail(n) |
|
Look at the last n rows |
df.shape |
|
Gives the number of rows and columns (shape is an attribute, not a method) |
df.info() |
|
Shows the index, column dtypes, non-null counts, and memory usage |
df.describe() |
|
Summary statistics for numerical columns |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
print(df) |
|
Create a DataFrame from a dictionary where each key-value pair corresponds to a column and its values |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
print(df)
print(df.loc[0, "Column1"])
print(df.loc[1, "Column1"])
print(df.loc[0, "Column2"]) |
1
2
5 |
Create a DataFrame from a dictionary where each key-value pair corresponds to a column and its values, and then read individual cells with .loc. Code. |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
print(df)
print()
df.index=["a", "b", "c", "d"]
print(df) |
|
Replace/change index |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
df.index=["a", "b", "c", "d"]
print(df)
print(df.loc["a", "Column1"])
print(df.loc["b", "Column1"])
print(df.loc["a", "Column2"]) |
|
Access cells with "a", "b", ... indices. Code. |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
df.index=["a", "b", "c", "d"]
print(df)
print()
z = df.iloc[0:1, 0:2]
print(z) |
|
Slice to form a new DataFrame (a subset of the DataFrame). Code. |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
df.index=["a", "b", "c", "d"]
print(df)
print()
z = df.loc["a":"b", "Column2":"Column4"]
print(z) |
|
Slice to form a new DataFrame (a subset of the DataFrame). Using .loc with label-based indexing. Code. |
import pandas as pd
data = {
'Column1': [1, 2, 3, 4],
'Column2': [5, 6, 7, 8],
'Column3': [9, 10, 11, 12],
'Column4': [13, 14, 15, 16]
}
df = pd.DataFrame(data)
df.index=["a", "b", "c", "d"]
print(df)
print()
z = df.iloc[0:2, 1:3] # Here 0:2 selects rows at index positions 0 and 1, and 1:3 selects columns at index positions 1 and 2
print(z)
|
|
Slice to form a new DataFrame (a subset of the DataFrame). Using .iloc for integer-based indexing. Code. |
df_sorted = df.sort_values(by='Age') |
|
Sort the DataFrame by the 'Age' column in ascending order |
df_sorted_desc = df.sort_values(by='Score', ascending=False) |
|
Sort the DataFrame by the 'Score' column in descending order |
df.sort_index() |
|
Sort by labels along an axis |
import pandas as pd
input_file = r"G:\My Drive\GlobalSino2006\ICs\images4\fail_rates_5_wafers.csv"
fail_rates_df = pd.read_csv(input_file)
fail_rates_df.set_index('bin_number', inplace=True)
# Check if any NaNs are present after loading
print("NaNs present after loading:", fail_rates_df.isnull().values.any())
# Conditions matrix for each wafer
conditions = {
'Condition1': [1, 1, 1, 1, 1],
'Condition2': [1, 1, 0, 1, 0],
'Condition3': [0, 1, 0, 1, 0],
'Condition4': [0, 0, 0, 0, 1],
'Condition5': [0, 0, 0, 1, 1],
'Condition6': [0, 1, 0, 0, 0],
'Condition7': [0, 0, 0, 1, 0],
'Condition8': [0, 0, 1, 0, 1],
'Condition9': [0, 1, 1, 0, 0],
'Condition10': [0, 0, 1, 0, 0]
}
conditions_df = pd.DataFrame(conditions)
# Combine the fail rates with conditions
combined_data = pd.concat([fail_rates_df.mean().to_frame().T, conditions_df], ignore_index=True)
# Check combined data for any NaNs
print("NaNs present in combined data:", combined_data.isnull().values.any()) |
NaNs present after loading: False
NaNs present in combined data: True
Error during PCA: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. |
The error shows that PCA detected NaN values in its input, even though the CSV itself has no missing values. Here the NaNs come from the concatenation step: fail_rates_df.mean().to_frame().T and conditions_df have different column names, so pd.concat along the row axis fills every non-overlapping cell with NaN. More generally, when this error appears:
Check for NaNs after reading the data: incorrectly formatted values or hidden characters can make pandas parse entries as NaN.
Verify data conversion and handling: make sure operations on the DataFrame, such as setting indices or calculating means, do not introduce NaNs.
Check the scaling step: StandardScaler will also fail if its input contains NaNs or non-numeric columns. |
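A hedged sketch of one way past the error, assuming the combined_data frame from the script above. Because the two concatenated frames have disjoint column sets, every column contains NaNs, so the gaps must either be filled or the frames analyzed separately; filling with 0 is only one of several defensible choices:

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Locate the NaNs: here every column has gaps from the row-wise concat
print(combined_data.isnull().sum())

# If stacking the two frames was intentional, fill the non-overlapping cells
clean = combined_data.fillna(0)

# Alternatively, run PCA on one coherent frame only:
# clean = conditions_df

X = StandardScaler().fit_transform(clean)
pca = PCA(n_components=2)
print(pca.fit_transform(X))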
============================================