import numpy as np # numbers, arrays, math
import pandas as pd # tables, csv loading, data handling
import os
import matplotlib.pyplot as plt # plotting
import shap # shap values for model explainability
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier # decision tree model
from sklearn.ensemble import RandomForestClassifier # random forest model
from sklearn.linear_model import LogisticRegression # logistic regression model
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import cross_val_score # cross validation scoring
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
#note: micromamba activate data_science_foundation to pip install and resolve shap issues
Import Libraries
All of the libraries imported above are needed for the full data analysis that follows.
EDA Assessments
Loading Data
Adopted from HW 3-4
#making a function to load the data (from HW 03-04)
#prompt the input for file path then load the data
def load_data():
    try:
        folder_name = input("Enter the folder path: ") #copy the path of the folder (right clicked data folder and copy path)
        file_list = [f for f in os.listdir(folder_name) if f.endswith('.csv')] #searches files with csv extension in that folder
        print(f"Files found in {folder_name}: {file_list}") #lists all matching files
        for file in file_list: #tries to load each file into each variable separately
            var_name = os.path.splitext(file)[0] #'customers' from 'customers.csv' for example
            globals()[var_name] = pd.read_csv(os.path.join(folder_name, file)) #set each file name as a variable
            print(f"Loaded {file} as variable '{var_name}'")
    except FileNotFoundError: #error handling
        print(f"Folder '{folder_name}' not found.")
        return None
    except Exception as e:
        print(f"Error loading data: {e}")
        return None
    return True
#calling the function (copy data folder entire path into the prompt box)
load_data()
Files found in /Users/matthewthompson/Documents/Academics/DS Masters Academics/Data Mining and Discovery/Assignments/final-project-thompson/data: ['family_related_data.csv', 'animals_rateof_evolution.csv']
Loaded family_related_data.csv as variable 'family_related_data'
Loaded animals_rateof_evolution.csv as variable 'animals_rateof_evolution'
True
family_df = family_related_data
evolution_df = animals_rateof_evolution
print(family_df.info())
print(evolution_df.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Tree_Label 1087 non-null object
1 Phylum 1087 non-null object
2 SS 1087 non-null int64
3 A 1087 non-null int64
4 G 1087 non-null int64
5 O 1087 non-null int64
6 T 1087 non-null int64
7 V 1087 non-null int64
8 C 1087 non-null int64
9 F 1087 non-null int64
10 K 1087 non-null int64
11 M 1087 non-null int64
12 S 1087 non-null int64
dtypes: int64(11), object(2)
memory usage: 110.5+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Tree 84 non-null int64
1 Phylum 84 non-null object
2 A 84 non-null float64
3 G 84 non-null float64
4 O 84 non-null float64
5 T 84 non-null float64
6 V 84 non-null float64
7 C 84 non-null float64
8 F 84 non-null float64
9 K 84 non-null float64
10 M 84 non-null float64
11 S 84 non-null float64
dtypes: float64(10), int64(1), object(1)
memory usage: 8.0+ KB
None
First DataFrame (family dataset): This dataset has 1,087 rows and 13 columns, containing tree labels, phylum information, and 11 integer-coded traits (SS, A, G, …, S). It is relatively large and purely categorical/numeric with no missing values.
Second DataFrame (evolution dataset): This dataset is much smaller, with 84 rows and 12 columns, including phylum and trait averages stored as floats. It provides summarized trait-level values per tree, making it more compact and aggregated than the first.
Family_df:
###### SHAPE AND INFO
#family dataset below
df1 = family_df.copy()
dataset_name = "family_df"
# quick overview
print(f"\n=== {dataset_name} EDA ===")
print(f"Shape: {df1.shape[0]} rows × {df1.shape[1]} columns")
print("\nColumns:", list(df1.columns))
print("\nDtypes:\n", df1.dtypes)
# pick likely target + trait columns
all_traits = ['SS','A','G','O','T','V','C','F','K','M','S']
traits_present = [c for c in all_traits if c in df1.columns]
target_col = 'Phylum' if 'Phylum' in df1.columns else None
id_cols = [c for c in ['Tree_Label'] if c in df1.columns]
#prints out the shape below
print("\nTarget column:", target_col)
print("Trait columns:", traits_present)
print("ID columns:", id_cols)
=== family_df EDA ===
Shape: 1087 rows × 13 columns
Columns: ['Tree_Label', 'Phylum', 'SS', 'A', 'G', 'O', 'T', 'V', 'C', 'F', 'K', 'M', 'S']
Dtypes:
Tree_Label object
Phylum object
SS int64
A int64
G int64
O int64
T int64
V int64
C int64
F int64
K int64
M int64
S int64
dtype: object
Target column: Phylum
Trait columns: ['SS', 'A', 'G', 'O', 'T', 'V', 'C', 'F', 'K', 'M', 'S']
ID columns: ['Tree_Label']
The family_df dataset has 1,087 rows and 13 columns, where Phylum is the categorical target, 11 numeric trait columns (SS–S) are features, and Tree_Label acts as an identifier. This makes it well-structured for classification tasks predicting phylum from trait patterns.
##### SUMMARY INFO numerics and categoricals
# numeric & categorical summaries:
numeric_df = df1.select_dtypes(include='number')
cat_df = df1.select_dtypes(include=['object','category'])
#prints summaries below
if not numeric_df.empty:
    print("\nSummary (numeric):\n", numeric_df.describe().T)
else:
    print("\nNo numeric columns for numeric summary.")

if not cat_df.empty:
    print("\nSummary (categorical):\n", cat_df.describe().T)
else:
    print("\nNo categorical columns for categorical summary.")
Summary (numeric):
count mean std min 25% 50% 75% max
SS 1087.0 0.269549 0.443930 0.0 0.0 0.0 1.0 1.0
A 1087.0 0.037718 0.190602 0.0 0.0 0.0 0.0 1.0
G 1087.0 0.021159 0.143981 0.0 0.0 0.0 0.0 1.0
O 1087.0 0.103036 0.304146 0.0 0.0 0.0 0.0 1.0
T 1087.0 0.131555 0.338162 0.0 0.0 0.0 0.0 1.0
V 1087.0 0.093836 0.291735 0.0 0.0 0.0 0.0 1.0
C 1087.0 0.109476 0.312379 0.0 0.0 0.0 0.0 1.0
F 1087.0 0.181233 0.385388 0.0 0.0 0.0 0.0 1.0
K 1087.0 0.007360 0.085512 0.0 0.0 0.0 0.0 1.0
M 1087.0 0.107636 0.310062 0.0 0.0 0.0 0.0 1.0
S 1087.0 0.015639 0.124133 0.0 0.0 0.0 0.0 1.0
Summary (categorical):
count unique top freq
Tree_Label 1087 1087 1Laevipil 1
Phylum 1087 29 Arthropoda 917
The numeric traits are mostly binary (0/1) with low mean values, showing that most traits are absent in most samples, except SS (27%) and F (18%), which occur more frequently. Categorical data shows each Tree_Label is unique, and the dataset is highly imbalanced, with Arthropoda dominating (917 of 1087, ~84%) among the 29 phyla.
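The imbalance quoted here can be read straight off a normalized value count. A minimal sketch, assuming the df1 copy created in the EDA cell above:

#share of each phylum, largest first (Arthropoda should come out around 0.84)
phylum_share = df1["Phylum"].value_counts(normalize=True)
print(phylum_share.head().round(3))

#how many of the 29 phyla are represented by only a handful of trees
print("phyla with < 5 samples:", (df1["Phylum"].value_counts() < 5).sum())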
# missing values
na = df1.isna().sum()
na = na[na > 0]

#if no NA, print a confirmation; otherwise report the columns with missing values
if na.empty:
    print("\nNo missing values detected.")
else:
    print("\nColumns with nulls:\n", na)
    print("\nNull %:\n", (na/len(df1)*100).round(2))
No missing values detected.
# unique counts
print("\nUnique values per column:\n", df1.nunique())
Unique values per column:
Tree_Label 1087
Phylum 29
SS 2
A 2
G 2
O 2
T 2
V 2
C 2
F 2
K 2
M 2
S 2
dtype: int64
There are 29 unique phylum categories, while the other columns only have two unique values each, meaning they act like simple yes/no indicators for traits.
#outlier scan (IQR) – skip pure binary columns
print("\nOutlier scan (IQR):")
for col in numeric_df.columns:
    vals = df1[col].dropna().unique()
    #treat as binary if values subset of {0,1}
    if set(vals).issubset({0, 1, 0.0, 1.0}):
        print(f"{col}: skipped (binary)")
        continue
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lo = Q1 - 1.5*IQR
    hi = Q3 + 1.5*IQR
    mask = (df1[col] < lo) | (df1[col] > hi) #mark values below Q1−1.5*IQR or above Q3+1.5*IQR as outliers
    cnt = int(mask.sum()) #count how many outliers there are below or outside IQR
    pct = 100*cnt/len(df1) #percentage of outliers
    print(f"{col}: {cnt} outliers ({pct:.2f}%)") #reports outlier count and %
Outlier scan (IQR):
SS: skipped (binary)
A: skipped (binary)
G: skipped (binary)
O: skipped (binary)
T: skipped (binary)
V: skipped (binary)
C: skipped (binary)
F: skipped (binary)
K: skipped (binary)
M: skipped (binary)
S: skipped (binary)
Since all trait columns are binary (0/1), an IQR-based outlier scan is not applicable; there are no true numeric outliers in this dataset. The traits can only vary by presence/absence, not by extreme values.
# Skewness
if not numeric_df.empty:
    with np.errstate(all='ignore'):
        skew_vals = numeric_df.drop(columns=id_cols, errors='ignore').skew(numeric_only=True)

    print("\n=== Skewness ===")
    for col, val in skew_vals.items():
        print(f"{col:5s}: {val: .4f}")
else:
    print("\nNo numeric columns for skewness in Family dataset.")

# Kurtosis
if not numeric_df.empty:
    with np.errstate(all='ignore'):
        kurt_vals = numeric_df.drop(columns=id_cols, errors='ignore').kurt(numeric_only=True)

    print("\n=== Kurtosis ===")
    for col, val in kurt_vals.items():
        print(f"{col:5s}: {val: .4f}")
else:
    print("\nNo numeric columns for kurtosis in Family dataset.")
=== Skewness ===
SS : 1.0401
A : 4.8597
G : 6.6637
O : 2.6152
T : 2.1831
V : 2.7896
C : 2.5049
F : 1.6573
K : 11.5434
M : 2.5355
S : 7.8183
=== Kurtosis ===
SS : -0.9198
A : 21.6564
G : 42.4832
O : 4.8480
T : 2.7711
V : 5.7925
C : 4.2826
F : 0.7481
K : 131.4920
M : 4.4371
S : 59.2347
All traits are positively skewed, meaning trait presence (1) is rare compared to absence (0), with especially strong skew for K, S, and G. The very high kurtosis values (e.g., K ≈ 131, S ≈ 59) confirm that most traits are concentrated near 0 with only occasional 1s, reflecting strong sparsity in the dataset. The strongest skew and highest kurtosis occur in K (female-female competition), S (intersexual conflict), and G (gustatory), showing these traits are extremely sparse, while more common traits like SS (any sexually selected trait) and F (female choice) are less skewed.
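These values are what sparse 0/1 indicators are expected to produce: for a Bernoulli variable with prevalence p, skewness is (1 - 2p)/sqrt(p(1 - p)) and excess kurtosis is (1 - 6p(1 - p))/(p(1 - p)), both of which blow up as p approaches 0. A minimal cross-check against the prevalences in the numeric summary above (pandas applies small-sample corrections, so the matches are approximate):

import numpy as np

def bernoulli_skew(p):
    #population skewness of a 0/1 variable with prevalence p
    return (1 - 2*p) / np.sqrt(p * (1 - p))

def bernoulli_excess_kurtosis(p):
    #population excess kurtosis of a 0/1 variable with prevalence p
    return (1 - 6*p*(1 - p)) / (p * (1 - p))

for trait, p in [("SS", 0.2695), ("F", 0.1812), ("K", 0.00736)]:  #prevalences from the numeric summary above
    print(f"{trait}: skew≈{bernoulli_skew(p):.2f}, excess kurtosis≈{bernoulli_excess_kurtosis(p):.2f}")
#SS comes out near 1.04 / -0.92 and K near 11.5 / 131, in line with the table above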
# correlations (absolute) among numeric
if not numeric_df.empty:
    corr = numeric_df.corr().abs()
    print("\nCorrelation matrix (|r|):\n", corr)

if corr is not None:
    plt.figure(figsize=(8,6))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", cbar=True)
    plt.title("Absolute Correlation Matrix (Family dataset traits)")
    plt.show()
Correlation matrix (|r|):
SS A G O T V C \
SS 1.000000 0.315031 0.242030 0.551115 0.634572 0.529735 0.577181
A 0.315031 1.000000 0.138659 0.107623 0.051520 0.135007 0.100703
G 0.242030 0.138659 1.000000 0.097360 0.056249 0.128062 0.010604
O 0.551115 0.107623 0.097360 1.000000 0.100862 0.119243 0.123462
T 0.634572 0.051520 0.056249 0.100862 1.000000 0.294774 0.639344
V 0.529735 0.135007 0.128062 0.119243 0.294774 1.000000 0.453005
C 0.577181 0.100703 0.010604 0.123462 0.639344 0.453005 1.000000
F 0.763723 0.370670 0.295909 0.539708 0.375067 0.438281 0.202182
K 0.141746 0.039448 0.012660 0.041626 0.157547 0.119935 0.211111
M 0.571720 0.274021 0.134572 0.722016 0.137071 0.163090 0.039847
S 0.207496 0.052882 0.136029 0.054837 0.258047 0.061146 0.027045
F K M S
SS 0.763723 0.141746 0.571720 0.207496
A 0.370670 0.039448 0.274021 0.052882
G 0.295909 0.012660 0.134572 0.136029
O 0.539708 0.041626 0.722016 0.054837
T 0.375067 0.157547 0.137071 0.258047
V 0.438281 0.119935 0.163090 0.061146
C 0.202182 0.211111 0.039847 0.027045
F 1.000000 0.043313 0.522427 0.171674
K 0.043313 1.000000 0.074283 0.010853
M 0.522427 0.074283 1.000000 0.027996
S 0.171674 0.010853 0.027996 1.000000
The strongest associations are SS–F (0.76), O–M (0.72), T–C (0.64), and SS–T (0.63), showing that combined sexually selected traits strongly co-occur with female choice, and olfactory traits with male choice. Visual (V), tactile (T), and male–male competition (C) also form a moderately correlated cluster, while weaker links (e.g., K and S) indicate that female–female competition and intersexual conflict occur largely independently of other traits.
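The pairs called out here can also be pulled out programmatically by ranking the upper triangle of the |r| matrix. A minimal sketch, assuming the corr DataFrame computed in the correlation cell above:

import numpy as np

#keep each pair once (upper triangle, diagonal excluded), then rank by |r|
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
top_pairs = upper.stack().sort_values(ascending=False).head(5)
print(top_pairs.round(2))  #expected to lead with SS-F (~0.76) and O-M (~0.72)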
# plots – trait prevalence, phylum distribution, histograms (binary will show as 0/1 counts)
#prevalence of binary traits
if traits_present:
    prev = df1[traits_present].mean().sort_values(ascending=False)
    plt.figure(figsize=(10,4))
    plt.bar(prev.index, prev.values)
    plt.title("Trait prevalence (proportion present)")
    plt.ylabel("Prevalence")
    plt.xlabel("Traits")
    plt.xticks(rotation=45, ha='right')
    plt.show()

#phylum distribution
if target_col:
    counts = df1[target_col].value_counts()
    plt.figure(figsize=(12,4))
    plt.bar(counts.index, counts.values)
    plt.title(f"{target_col} distribution")
    plt.ylabel("Count")
    plt.xlabel("Phylum Name")
    plt.xticks(rotation=90)
    plt.show()

binary_cols = [c for c in traits_present if c in df1.columns]

# histograms for binary traits (0/1 only)
if binary_cols:
    n = len(binary_cols)
    ncols = min(5, n)
    nrows = (n + ncols - 1) // ncols
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols,
        figsize=(2.5*ncols, 2*nrows) # shrink each plot
    )
    axes = np.array(axes).reshape(-1) if isinstance(axes, np.ndarray) else [axes]

    for i, col in enumerate(binary_cols):
        ax = axes[i]
        ax.hist(df1[col].dropna(), bins=[-0.5, 0.5, 1.5], edgecolor="black")
        ax.set_title(col, fontsize=9) # smaller title font
        ax.set_xticks([0, 1])
        ax.set_xlabel("Binary", fontsize=8)
        ax.set_ylabel("Count", fontsize=8)

    for ax in axes[n:]:
        ax.axis('off')

    plt.tight_layout(pad=1.0) # tighter spacing
    plt.show()

# traits by phylum (prevalence), if target exists
if target_col and traits_present:
    group_prev = df1.groupby(target_col)[traits_present].mean().sort_index()
    print("\nTrait prevalence by phylum (proportion present):\n", group_prev.round(3))

    #barplot per trait aggregated (wide)
    ax = group_prev.plot(kind='bar', figsize=(12, max(4, 0.35*len(group_prev.columns))))
    ax.set_title(f"Trait prevalence by {target_col}")
    ax.set_ylabel("Prevalence")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
Trait prevalence by phylum (proportion present):
SS A G O T V C F \
Phylum
Acoela 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Annelida 0.500 0.000 0.000 0.500 0.500 0.000 0.250 0.25
Arthropoda 0.296 0.037 0.025 0.115 0.149 0.098 0.115 0.20
Brachiopoda 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Bryozoa 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Chaetognatha 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Chordata 0.346 0.135 0.000 0.077 0.077 0.212 0.231 0.25
Cnidaria 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Ctenophora 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Echinodermata 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Entoprocta 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Gastrotricha 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Gnathostomulida 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Hemichordata 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Kinorhyncha 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Mollusca 0.014 0.000 0.000 0.000 0.000 0.014 0.014 0.00
Nematoda 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Nematomorpha 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Nemertea 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Onychophora 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Phoronida 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Placozoa 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Platyhelminthes 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Porifera 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Priapulida 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Rotifera 1.000 0.000 0.000 1.000 0.000 0.000 0.000 0.00
Sipuncula 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Tardigrada 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
Xenoturbellida 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.00
K M S
Phylum
Acoela 0.000 0.000 0.000
Annelida 0.250 0.250 0.000
Arthropoda 0.007 0.122 0.019
Brachiopoda 0.000 0.000 0.000
Bryozoa 0.000 0.000 0.000
Chaetognatha 0.000 0.000 0.000
Chordata 0.019 0.058 0.000
Cnidaria 0.000 0.000 0.000
Ctenophora 0.000 0.000 0.000
Echinodermata 0.000 0.000 0.000
Entoprocta 0.000 0.000 0.000
Gastrotricha 0.000 0.000 0.000
Gnathostomulida 0.000 0.000 0.000
Hemichordata 0.000 0.000 0.000
Kinorhyncha 0.000 0.000 0.000
Mollusca 0.000 0.000 0.000
Nematoda 0.000 0.000 0.000
Nematomorpha 0.000 0.000 0.000
Nemertea 0.000 0.000 0.000
Onychophora 0.000 0.000 0.000
Phoronida 0.000 0.000 0.000
Placozoa 0.000 0.000 0.000
Platyhelminthes 0.000 0.000 0.000
Porifera 0.000 0.000 0.000
Priapulida 0.000 0.000 0.000
Rotifera 0.000 1.000 0.000
Sipuncula 0.000 0.000 0.000
Tardigrada 0.000 0.000 0.000
Xenoturbellida 0.000 0.000 0.000
#trait prevalence by phylum
= df1.groupby("Phylum")[['SS','A','G','O','T','V','C','F','K','M','S']].mean()
group_prev
#filter to only show phylum × traits where prevalence > 0
= group_prev[group_prev > 0].dropna(how="all")
filtered_prev
print("Non-zero trait prevalence by phylum (df1):\n")
print(filtered_prev)
Non-zero trait prevalence by phylum (df1):
SS A G O T V \
Phylum
Annelida 0.500000 NaN NaN 0.500000 0.500000 NaN
Arthropoda 0.295529 0.037077 0.025082 0.114504 0.149400 0.098146
Chordata 0.346154 0.134615 NaN 0.076923 0.076923 0.211538
Mollusca 0.014286 NaN NaN NaN NaN 0.014286
Rotifera 1.000000 NaN NaN 1.000000 NaN NaN
C F K M S
Phylum
Annelida 0.250000 0.250000 0.250000 0.250000 NaN
Arthropoda 0.114504 0.199564 0.006543 0.122137 0.018539
Chordata 0.230769 0.250000 0.019231 0.057692 NaN
Mollusca 0.014286 NaN NaN NaN NaN
Rotifera NaN NaN NaN 1.000000 NaN
Trait prevalence – The most common trait is SS (any sexually selected trait, ~27%), followed by F (female choice, ~18%) and T (tactile, ~13%), while traits like K (female–female competition), S (intersexual conflict), and G (gustatory) are rare (<3%).
Phylum distribution – The dataset is highly imbalanced, with Arthropoda dominating (~84%), while Mollusca and Chordata make up smaller fractions, and most other phyla have very few samples.
Trait histograms – All traits are binary with strong skew toward 0 (absence), confirming sparsity and explaining the high skewness/kurtosis values.
Trait prevalence by phylum – Certain phyla (e.g., Arthropoda, Chordata, Annelida) show meaningful diversity of trait presence, while most phyla exhibit almost no sexually selected traits in the dataset.
Evolution df:
#making a copy of evolution_df
df2 = evolution_df.copy()
# quick overview
print(f"\n=== df2 EDA (evolution_df) ===")
print(f"Shape: {df2.shape[0]} rows × {df2.shape[1]} columns")
print("\nColumns:", list(df2.columns))
print("\nDtypes:\n", df2.dtypes)
# target and trait columns
all_traits = ['A','G','O','T','V','C','F','K','M','S'] # continuous traits
traits_present = [c for c in all_traits if c in df2.columns]
target_col = 'Phylum' if 'Phylum' in df2.columns else None
id_cols = [c for c in ['Tree'] if c in df2.columns]
#printed shapes below
print("\nTarget column:", target_col)
print("Trait columns:", traits_present)
print("ID columns:", id_cols)
=== df2 EDA (evolution_df) ===
Shape: 84 rows × 12 columns
Columns: ['Tree', 'Phylum', 'A', 'G', 'O', 'T', 'V', 'C', 'F', 'K', 'M', 'S']
Dtypes:
Tree int64
Phylum object
A float64
G float64
O float64
T float64
V float64
C float64
F float64
K float64
M float64
S float64
dtype: object
Target column: Phylum
Trait columns: ['A', 'G', 'O', 'T', 'V', 'C', 'F', 'K', 'M', 'S']
ID columns: ['Tree']
The evolution_df dataset (84 rows × 12 columns) tracks evolutionary origin rates of 10 sexually selected traits (A–S) across phyla, providing continuous rate-based predictors rather than the binary presence/absence data of family_df.
# numeric & categorical summaries
numeric_df = df2.select_dtypes(include='number')
cat_df = df2.select_dtypes(include=['object','category'])

if not numeric_df.empty:
    print("\nSummary (numeric):\n", numeric_df.describe().T)
else:
    print("\nNo numeric columns for numeric summary.")

if not cat_df.empty:
    print("\nSummary (categorical):\n", cat_df.describe().T)
else:
    print("\nNo categorical columns for categorical summary.")
Summary (numeric):
count mean std min 25% 50% 75% max
Tree 84.0 2.000000 0.821401 1.0 1.0 2.0 3.0 3.000000
A 84.0 0.000019 0.000069 0.0 0.0 0.0 0.0 0.000301
G 84.0 0.000063 0.000528 0.0 0.0 0.0 0.0 0.004833
O 84.0 0.000028 0.000100 0.0 0.0 0.0 0.0 0.000400
T 84.0 0.000056 0.000230 0.0 0.0 0.0 0.0 0.001200
V 84.0 0.000117 0.000456 0.0 0.0 0.0 0.0 0.002300
C 84.0 0.000066 0.000234 0.0 0.0 0.0 0.0 0.001000
F 84.0 0.000089 0.000323 0.0 0.0 0.0 0.0 0.001341
K 84.0 0.000005 0.000018 0.0 0.0 0.0 0.0 0.000088
M 84.0 0.000028 0.000106 0.0 0.0 0.0 0.0 0.000509
S 84.0 0.000004 0.000020 0.0 0.0 0.0 0.0 0.000108
Summary (categorical):
count unique top freq
Phylum 84 28 Tardigrada 3
The evolution_df traits have extremely small mean rates (mostly near 0 with rare spikes), showing that sexually selected traits originate very infrequently across lineages; meanwhile, the categorical data span 28 phyla with a relatively even spread (max frequency only 3 for Tardigrada), making this dataset far more balanced than family_df.
# missing values
na = df2.isna().sum()
na = na[na > 0]
if na.empty:
    print("\nNo missing values detected.")
else:
    print("\nColumns with nulls:\n", na)
    print("\nNull %:\n", (na/len(df2)*100).round(2))
No missing values detected.
# unique counts
print("\nUnique values per column:\n", df2.nunique())
Unique values per column:
Tree 3
Phylum 28
A 6
G 3
O 5
T 6
V 5
C 6
F 6
K 5
M 6
S 4
dtype: int64
Each trait in evolution_df only has a few unique rate values (mostly 3–6), so even though they’re stored as continuous numbers, the values are very small and clustered. The dataset still covers 28 phyla, giving it wide taxonomic coverage but not much variation within each trait.
#outlier scan (IQR) – continuous traits only
print("\nOutlier scan (IQR):")
for col in numeric_df.columns:
    if col in id_cols: # skip ID
        print(f"{col}: skipped (ID)")
        continue
    Q1 = df2[col].quantile(0.25)
    Q3 = df2[col].quantile(0.75)
    IQR = Q3 - Q1
    lo = Q1 - 1.5*IQR
    hi = Q3 + 1.5*IQR
    mask = (df2[col] < lo) | (df2[col] > hi) #mark values below or above the IQR fences as outliers
    cnt = int(mask.sum()) #counts these values
    pct = 100*cnt/len(df2) #percentage of outliers
    print(f"{col}: {cnt} outliers ({pct:.2f}%)")
Outlier scan (IQR):
Tree: skipped (ID)
A: 6 outliers (7.14%)
G: 3 outliers (3.57%)
O: 6 outliers (7.14%)
T: 6 outliers (7.14%)
V: 9 outliers (10.71%)
C: 9 outliers (10.71%)
F: 6 outliers (7.14%)
K: 6 outliers (7.14%)
M: 6 outliers (7.14%)
S: 3 outliers (3.57%)
The outlier details are broken out below:
#traits to scan (exclude ID/target)
trait_cols = [c for c in df2.columns if c not in ["Tree", "Phylum"]]

#IQR outlier finder (returns both high and low outliers)
def find_outliers_iqr(d, col):
    q1 = d[col].quantile(0.25)
    q3 = d[col].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    out = d[(d[col] < lower) | (d[col] > upper)].copy()
    out["OutlierSide"] = out[col].apply(lambda x: "low" if x < lower else "high")
    out["Trait"] = col
    return out[["Trait", "Phylum", col, "OutlierSide"]].rename(columns={col: "Value"})

#dict of DataFrames per trait
outlier_dict = {col: find_outliers_iqr(df2, col) for col in trait_cols}

#tidy table of all outliers (stacked)
outliers_tidy = pd.concat(outlier_dict.values(), ignore_index=True)

#counts by trait
outlier_counts_by_trait = (
    outliers_tidy.groupby("Trait", as_index=False)
    .size()
    .rename(columns={"size": "n_outliers"})
    .sort_values("n_outliers", ascending=False)
)

#phyla contributing outliers per trait
phyla_per_trait = (
    outliers_tidy.groupby("Trait")["Phylum"]
    .apply(lambda s: ", ".join(sorted(s.unique())))
    .reset_index(name="Phyla_with_outliers")
)

#combined into one summary
outlier_summary = outlier_counts_by_trait.merge(phyla_per_trait, on="Trait", how="left")
print("Outlier summary (IQR):")
print(outlier_summary.to_string(index=False))
Outlier summary (IQR):
Trait n_outliers Phyla_with_outliers
C 9 Arthropoda, Chordata, Mollusca
V 9 Arthropoda, Chordata, Mollusca
A 6 Arthropoda, Chordata
F 6 Arthropoda, Chordata
K 6 Arthropoda, Chordata
M 6 Arthropoda, Chordata
O 6 Arthropoda, Chordata
T 6 Arthropoda, Chordata
G 3 Arthropoda
S 3 Arthropoda
Outliers in trait evolution rates are almost entirely concentrated in Arthropoda and Chordata, with Mollusca contributing only for Visual and Male–male competition, while all other phyla show none.
# Skewness
if not numeric_df.empty:
    with np.errstate(all='ignore'):
        skew_vals = numeric_df.drop(columns=id_cols, errors='ignore').skew(numeric_only=True)

    print("\n=== Skewness ===")
    for col, val in skew_vals.items():
        print(f"{col:5s}: {val: .4f}")
else:
    print("\nNo numeric columns for skewness.")

# Kurtosis
if not numeric_df.empty:
    with np.errstate(all='ignore'):
        kurt_vals = numeric_df.drop(columns=id_cols, errors='ignore').kurt(numeric_only=True)

    print("\n=== Kurtosis ===")
    for col, val in kurt_vals.items():
        print(f"{col:5s}: {val: .4f}")
else:
    print("\nNo numeric columns for kurtosis.")
=== Skewness ===
A : 3.5053
G : 9.0968
O : 3.3938
T : 4.4907
V : 4.2405
C : 3.4515
F : 3.4248
K : 4.0070
M : 3.8618
S : 5.0951
=== Kurtosis ===
A : 10.8319
G : 83.1277
O : 9.7628
T : 19.7685
V : 17.6104
C : 10.3801
F : 10.0587
K : 15.5086
M : 14.1807
S : 24.5445
All trait origin rates are strongly right-skewed with high kurtosis, meaning most values sit near zero while a few phyla show much higher rates. The most extreme cases are Gustatory (G) and Intersexual conflict (S), which are both very rare and unevenly distributed. The least skewed are Olfactory (O, ~3.39) and Female choice (F, ~3.42), but even these remain heavily tilted to the right rather than balanced.
#correlations
cont_cols = [c for c in traits_present if c in df2.columns]
if cont_cols:
    corr = df2[cont_cols].corr().abs()
    print("\nCorrelation matrix (|r|):\n", corr.round(3))

    # Heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(
        corr,
        annot=True,
        cmap="coolwarm",
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={"shrink": 0.8, "label": "|r|"}
    )
    plt.title("Absolute Correlation Matrix — Evolution Traits", fontsize=14, pad=12)
    plt.xticks(rotation=45, ha="right")
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    print("No continuous trait columns found in df2.")
Correlation matrix (|r|):
A G O T V C F K M S
A 1.000 0.365 0.990 0.805 0.962 0.967 0.998 0.980 0.906 0.585
G 0.365 1.000 0.450 0.603 0.209 0.484 0.383 0.257 0.542 0.628
O 0.990 0.450 1.000 0.879 0.915 0.992 0.996 0.942 0.955 0.691
T 0.805 0.603 0.879 1.000 0.613 0.928 0.842 0.670 0.980 0.952
V 0.962 0.209 0.915 0.613 1.000 0.862 0.942 0.997 0.756 0.342
C 0.967 0.484 0.992 0.928 0.862 1.000 0.981 0.896 0.982 0.770
F 0.998 0.383 0.996 0.842 0.942 0.981 1.000 0.964 0.932 0.638
K 0.980 0.257 0.942 0.670 0.997 0.896 0.964 1.000 0.802 0.410
M 0.906 0.542 0.955 0.980 0.756 0.982 0.932 0.802 1.000 0.874
S 0.585 0.628 0.691 0.952 0.342 0.770 0.638 0.410 0.874 1.000
Most sexually selected traits in evolution_df are almost perfectly correlated, forming a tight co-evolving cluster (auditory, olfactory, visual, competition, and choice traits). The only exception is gustatory (G), which shows weaker links and stands out as the most independent trait.
# plots – histograms, phylum distribution, presence prevalence
#phylum distribution
if target_col:
    counts = df2[target_col].value_counts()
    plt.figure(figsize=(10,4))
    plt.bar(counts.index, counts.values)
    plt.title(f"{target_col} distribution")
    plt.ylabel("Count")
    plt.xticks(rotation=90)
    plt.show()

#presence prevalence derived from rates (>0)
if cont_cols:
    presence = (df2[cont_cols] > 0).mean().sort_values(ascending=False)
    plt.figure(figsize=(10,4))
    plt.bar(presence.index, presence.values)
    plt.title("Presence prevalence from rates (>0)")
    plt.ylabel("Prevalence (mean)")
    plt.xlabel("Traits")
    plt.xticks(rotation=45, ha='right')
    plt.show()
In evolution_df, all traits are highly right-skewed with most values at zero and only a few higher rates, while phyla are evenly represented (3 each). The most common evolving traits are Visual (V) and Male–male competition (C), whereas Gustatory (G) and Intersexual conflict (S) are the rarest.
#trait means by phylum
if target_col and cont_cols:
    means_by_phylum = df2.groupby(target_col)[cont_cols].mean().sort_index()
    print("\nTrait means by phylum:\n", means_by_phylum.round(3))

    ax = means_by_phylum.plot(kind='bar', figsize=(12, max(4, 0.35*len(means_by_phylum.columns))))
    ax.set_title(f"Trait means by {target_col}")
    ax.set_ylabel("Mean rate")
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
Trait means by phylum:
A G O T V C F K M S
Phylum
Acoela 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Annelida 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Arthropoda 0.0 0.002 0.0 0.001 0.001 0.001 0.001 0.0 0.001 0.0
Brachiopoda 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Bryozoa 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Chaetognatha 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Chordata 0.0 0.000 0.0 0.000 0.002 0.001 0.001 0.0 0.000 0.0
Cnidaria 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Ctenophora 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Echinodermata 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Entoprocta 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Gastrotricha 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Gnathostomulida 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Hemichordata 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Kinorhynca 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Mollusca 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Nematoda 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Nematomorpha 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Nemertea 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Onychophora 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Phoronida 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Placozoa 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Platyhelminthes 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Porifera 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Priapulida 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Rotifera 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Tardigrada 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
Xenoturbellida 0.0 0.000 0.0 0.000 0.000 0.000 0.000 0.0 0.000 0.0
#filter df2 trait means by phylum to only show > 0 values
= df2.groupby("Phylum")[['A','G','O','T','V','C','F','K','M','S']].mean()
trait_means = trait_means[trait_means > 0].dropna(how="all")
filtered
print("Non-zero trait means by phylum:\n")
print(filtered)
Non-zero trait means by phylum:
A G O T V C \
Phylum
Arthropoda 0.000227 0.001774 0.000384 0.001187 0.000923 0.000997
Chordata 0.000299 NaN 0.000387 0.000377 0.002300 0.000800
Mollusca NaN NaN NaN NaN 0.000059 0.000059
F K M S
Phylum
Arthropoda 0.001152 0.000043 0.000506 0.000108
Chordata 0.001333 0.000087 0.000276 NaN
Mollusca NaN NaN NaN NaN
Across phyla, evolutionary trait rates are largely near zero, but a few groups show weak signals: Arthropoda (notably G, T, V, C, F, M), Chordata (V, C, F), and Mollusca (minor nonzero values for V and C), while nearly all others remain flat at zero.
Overall Key Insights
Family
- Dataset structure: The dataset has 1,087 rows × 13 columns, with Phylum as the categorical target, 11 binary-coded sexually selected traits (SS–S) as predictors, and Tree_Label as a unique ID.
- Severe class imbalance: The Arthropoda phylum dominates (~84%), while Mollusca (~8%) and Chordata (~5%) are minor classes, and most other phyla have only a handful of samples. This imbalance will strongly affect classification performance.
- Trait prevalence patterns: Most traits are rare, with SS (any sexually selected trait, ~27%) and F (female choice, ~18%) being the most frequent. In contrast, K (female–female competition), S (intersexual conflict), and G (gustatory) appear in <3% of samples, meaning they contribute very sparse signals.
- Distribution shape: Because traits are binary and sparse, they show strong positive skew and extreme kurtosis, indicating most samples lack a given trait. This makes the dataset sparse and unbalanced at both feature and target levels.
- Correlation structure: Traits cluster into co-occurring groups:
  - SS and F are very tightly linked (r ≈ 0.76),
  - Olfactory (O) and Male choice (M) also correlate strongly (r ≈ 0.72),
  - Tactile (T) and Male–male competition (C) pair closely (r ≈ 0.64).
  These clusters suggest certain evolutionary mechanisms tend to evolve together.
- By phylum trait diversity: Only a few phyla (Arthropoda, Chordata, Annelida) show meaningful variation in sexually selected traits. Most phyla have near-zero prevalence, meaning trait-driven classification will be heavily biased toward the larger groups.

Evolution
- Dataset structure: 84 rows × 12 columns; 10 continuous trait rate columns (A–S), Phylum as target, Tree as ID.
- Class balance: 28 phyla, each appearing exactly 3 times → perfectly balanced distribution.
- Trait distributions: All traits are heavily right-skewed with high kurtosis, clustered near zero with only a few high values.
- Outliers: Concentrated almost entirely in Arthropoda and Chordata, with Mollusca contributing for Visual (V) and Male–male competition (C).
- Trait prevalence (nonzero rates): Most common are Visual (V) and Male–male competition (C) (~11%), while Gustatory (G) and Intersexual conflict (S) are the rarest (~4%). [“Presence prevalence from rates (>0 → 1)” bar chart]
- Correlations: Nearly all traits co-evolve in a tight cluster (correlations >0.95), with Gustatory (G) standing out as the least connected trait.
Family_df shows which traits are present or absent but is heavily skewed toward Arthropoda, while Evolution_df tracks how fast traits evolve across phyla, with a balanced sample but traits that are very uneven and strongly tied together.
Data Preprocessing (Preparations for ML)
Family Below
# Make a copy for preprocessing steps
family_preprocessing = family_df.copy()

# Drop ID column (Tree_Label)
if "Tree_Label" in family_preprocessing.columns:
    family_preprocessing = family_preprocessing.drop(columns=["Tree_Label"])
family_preprocessing.head()
 | Phylum | SS | A | G | O | T | V | C | F | K | M | S |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Mollusca | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
1 | Mollusca | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
2 | Chordata | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
3 | Chordata | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
4 | Chordata | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
#strip leading/trailing spaces in Phylum column
"Phylum"] = family_preprocessing["Phylum"].astype(str).str.strip()
family_preprocessing[
#defined superphylum groupings (after some research)
= {
groups "Ecdysozoa": {
"Arthropoda", "Nematoda", "Nematomorpha", "Priapulida",
"Kinorhyncha", "Kinorhynca", "Tardigrada", "Onychophora"
},"Lophotrochozoa": {
"Mollusca", "Annelida", "Brachiopoda", "Bryozoa",
"Phoronida", "Nemertea", "Rotifera", "Gastrotricha",
"Gnathostomulida", "Sipuncula", "Platyhelminthes", "Entoprocta"
},"Deuterostomia": {
"Chordata", "Echinodermata", "Hemichordata"
},"Basal Metazoa & Non-Bilaterians": {
"Porifera", "Placozoa", "Cnidaria", "Ctenophora"
},"Basal Bilateria": {
"Acoela", "Chaetognatha", "Xenoturbellida"
}
}
#mapping dictionary
= {}
phylum_to_group for group_name, phyla in groups.items():
for p in phyla:
= group_name
phylum_to_group[p]
#addedSuperphylum column
"Superphylum"] = family_preprocessing["Phylum"].map(phylum_to_group)
family_preprocessing[
# Coverage check
= set(family_preprocessing["Phylum"].unique())
unique_phyla = [p for p in unique_phyla if p in phylum_to_group]
assigned = [p for p in unique_phyla if p not in phylum_to_group]
unassigned
= pd.DataFrame({
df_map "Phylum": sorted(unique_phyla),
"Assigned_Group": [phylum_to_group.get(p, "") for p in sorted(unique_phyla)],
"Is_Assigned": [p in phylum_to_group for p in sorted(unique_phyla)]
})
= {
summary "total_unique_phyla": len(unique_phyla),
"assigned_count": len(assigned),
"unassigned_count": len(unassigned),
"unassigned_list": unassigned
}
#quick check if there are any unassigned (none of them so it's good)
summary
#info
family_preprocessing.info()
#header
print(family_preprocessing.head())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Phylum 1087 non-null object
1 SS 1087 non-null int64
2 A 1087 non-null int64
3 G 1087 non-null int64
4 O 1087 non-null int64
5 T 1087 non-null int64
6 V 1087 non-null int64
7 C 1087 non-null int64
8 F 1087 non-null int64
9 K 1087 non-null int64
10 M 1087 non-null int64
11 S 1087 non-null int64
12 Superphylum 1087 non-null object
dtypes: int64(11), object(2)
memory usage: 110.5+ KB
Phylum SS A G O T V C F K M S Superphylum
0 Mollusca 0 0 0 0 0 0 0 0 0 0 0 Lophotrochozoa
1 Mollusca 0 0 0 0 0 0 0 0 0 0 0 Lophotrochozoa
2 Chordata 0 0 0 0 0 0 0 0 0 0 0 Deuterostomia
3 Chordata 0 0 0 0 0 0 0 0 0 0 0 Deuterostomia
4 Chordata 0 0 0 0 0 0 0 0 0 0 0 Deuterostomia
# encode Superphylum target
sup_le = LabelEncoder()
family_preprocessing["Superphylum_encoded"] = sup_le.fit_transform(
    family_preprocessing["Superphylum"].astype(str)
)

# features = trait flags only (exclude Superphylum columns)
trait_cols = ["SS","A","G","O","T","V","C","F","K","M","S"]
X = family_preprocessing[trait_cols].copy()
y = family_preprocessing["Superphylum_encoded"].copy()
print("Classes:", dict(enumerate(sup_le.classes_)))
Classes: {0: 'Basal Bilateria', 1: 'Basal Metazoa & Non-Bilaterians', 2: 'Deuterostomia', 3: 'Ecdysozoa', 4: 'Lophotrochozoa'}
#train/test split
X_train_fam, X_test_fam, y_train_fam, y_test_fam = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

#class weights
classes = np.unique(y_train_fam)
class_weights = compute_class_weight("balanced", classes=classes, y=y_train_fam)
cw_fam = {c: w for c, w in zip(classes, class_weights)}

#standardize features below
scaler_fam = StandardScaler()
X_train_fam_scaled = scaler_fam.fit_transform(X_train_fam)
X_test_fam_scaled = scaler_fam.transform(X_test_fam)

print("Train/Test:", X_train_fam.shape, X_test_fam.shape)
print("Class weights:", cw_fam)

#after preprocessing, making a copy of the preprocessed data -> family_ML for model evaluations
family_ML = family_preprocessing.copy()
Train/Test: (869, 11) (218, 11)
Class weights: {np.int64(0): np.float64(57.93333333333333), np.int64(1): np.float64(19.31111111111111), np.int64(2): np.float64(3.697872340425532), np.int64(3): np.float64(0.23423180592991913), np.int64(4): np.float64(2.5558823529411763)}
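These printed weights follow sklearn's "balanced" rule, weight_c = n_samples / (n_classes * count_c), which is why the dominant Ecdysozoa class gets a weight near 0.23 while the rarest class gets a weight near 58. A minimal sketch re-deriving them from the training labels, assuming the y_train_fam and cw_fam objects created above:

#re-derive the "balanced" class weights: n_samples / (n_classes * count_per_class)
counts = pd.Series(y_train_fam).value_counts().sort_index()
manual_weights = len(y_train_fam) / (len(counts) * counts)
print(manual_weights.round(3))  #should line up with the cw_fam dictionary printed above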
Evolution Below
#making a copy for preprocessing steps
evolution_preprocessing = evolution_df.copy()

#Drop ID column (Tree)
if "Tree" in evolution_preprocessing.columns:
    evolution_preprocessing = evolution_preprocessing.drop(columns=["Tree"])
evolution_preprocessing.head()
 | Phylum | A | G | O | T | V | C | F | K | M | S |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | Tardigrada | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
1 | Onychophora | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
2 | Arthropoda | 0.000228 | 0.000244 | 0.000376 | 0.00118 | 0.000935 | 0.000995 | 0.001178 | 0.000043 | 0.000509 | 0.000108 |
3 | Nematoda | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
4 | Nematomorpha | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
#stripped whitespace from Phylum names
"Phylum"] = evolution_preprocessing["Phylum"].astype(str).str.strip()
evolution_preprocessing[
#defined superphylum groupings
= {
groups "Ecdysozoa": {
"Arthropoda", "Nematoda", "Nematomorpha", "Priapulida",
"Kinorhyncha", "Kinorhynca", "Tardigrada", "Onychophora"
},"Lophotrochozoa": {
"Mollusca", "Annelida", "Brachiopoda", "Bryozoa",
"Phoronida", "Nemertea", "Rotifera", "Gastrotricha",
"Gnathostomulida", "Sipuncula", "Platyhelminthes", "Entoprocta"
},"Deuterostomia": {
"Chordata", "Echinodermata", "Hemichordata"
},"Basal Metazoa & Non-Bilaterians": {
"Porifera", "Placozoa", "Cnidaria", "Ctenophora"
},"Basal Bilateria": {
"Acoela", "Chaetognatha", "Xenoturbellida"
}
}
#mapping dictionary
= {}
phylum_to_group for group_name, phyla in groups.items():
for p in phyla:
= group_name
phylum_to_group[p]
#added Superphylum column
"Superphylum"] = evolution_preprocessing["Phylum"].map(phylum_to_group)
evolution_preprocessing[
#verify coverage below
= set(evolution_preprocessing["Phylum"].unique())
unique_phyla = [p for p in unique_phyla if p not in phylum_to_group]
unassigned
print("Total unique phyla:", len(unique_phyla))
print("Unassigned phyla (should be empty):", unassigned)
evolution_preprocessing.head()
Total unique phyla: 28
Unassigned phyla (should be empty): []
 | Phylum | A | G | O | T | V | C | F | K | M | S | Superphylum |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Tardigrada | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Ecdysozoa |
1 | Onychophora | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Ecdysozoa |
2 | Arthropoda | 0.000228 | 0.000244 | 0.000376 | 0.00118 | 0.000935 | 0.000995 | 0.001178 | 0.000043 | 0.000509 | 0.000108 | Ecdysozoa |
3 | Nematoda | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Ecdysozoa |
4 | Nematomorpha | 0.000000 | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | Ecdysozoa |
#encoding superphylum:
#encode Superphylum into numeric labels, keep the text column
encoder = LabelEncoder()
evolution_preprocessing["Superphylum_encoded"] = encoder.fit_transform(
    evolution_preprocessing["Superphylum"].astype(str)
)

#mapping for reference
superphylum_classes = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Superphylum encoding mapping:", superphylum_classes)

evolution_preprocessing[["Phylum", "Superphylum", "Superphylum_encoded"]].head()
Superphylum encoding mapping: {'Basal Bilateria': np.int64(0), 'Basal Metazoa & Non-Bilaterians': np.int64(1), 'Deuterostomia': np.int64(2), 'Ecdysozoa': np.int64(3), 'Lophotrochozoa': np.int64(4)}
 | Phylum | Superphylum | Superphylum_encoded |
---|---|---|---|
0 | Tardigrada | Ecdysozoa | 3 |
1 | Onychophora | Ecdysozoa | 3 |
2 | Arthropoda | Ecdysozoa | 3 |
3 | Nematoda | Ecdysozoa | 3 |
4 | Nematomorpha | Ecdysozoa | 3 |
# encoding and log + scaling
#encode Superphylum (target)
superphylum_le_evo = LabelEncoder()
evolution_preprocessing["Superphylum_encoded"] = superphylum_le_evo.fit_transform(
    evolution_preprocessing["Superphylum"].astype(str)
)

#defined features:
#continuous rates only (drop phylum target completely)
rate_cols = ["A","G","O","T","V","C","F","K","M","S"]
X_rates = evolution_preprocessing[rate_cols]
y_evo = evolution_preprocessing["Superphylum_encoded"]

#Log transform the rates
X_log = np.log1p(X_rates)

#standardized features below
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_log)

#final features are just scaled rates
X_evo_final = X_scaled

#1/3 test split (stratify on superphyla now)
X_train_evo, X_test_evo, y_train_evo, y_test_evo = train_test_split(
    X_evo_final, y_evo,
    test_size=1/3,
    random_state=42,
    stratify=y_evo
)

#mapping for decoding predictions later
superphylum_id_to_name_evo = {i: n for i, n in enumerate(superphylum_le_evo.classes_)}

print("Train/Test shapes:", X_train_evo.shape, X_test_evo.shape)
print("Unique superphyla:", len(superphylum_id_to_name_evo))
print("Example mapping (id -> name):", list(superphylum_id_to_name_evo.items())[:5])

#evolution_preprocessing copy into evolution_ML
evolution_ML = evolution_preprocessing.copy()
#header
print(evolution_ML.head())
Train/Test shapes: (56, 10) (28, 10)
Unique superphyla: 5
Example mapping (id -> name): [(0, 'Basal Bilateria'), (1, 'Basal Metazoa & Non-Bilaterians'), (2, 'Deuterostomia'), (3, 'Ecdysozoa'), (4, 'Lophotrochozoa')]
Phylum A G O T V C \
0 Tardigrada 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000
1 Onychophora 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000
2 Arthropoda 0.000228 0.000244 0.000376 0.00118 0.000935 0.000995
3 Nematoda 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000
4 Nematomorpha 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000
F K M S Superphylum Superphylum_encoded
0 0.000000 0.000000 0.000000 0.000000 Ecdysozoa 3
1 0.000000 0.000000 0.000000 0.000000 Ecdysozoa 3
2 0.001178 0.000043 0.000509 0.000108 Ecdysozoa 3
3 0.000000 0.000000 0.000000 0.000000 Ecdysozoa 3
4 0.000000 0.000000 0.000000 0.000000 Ecdysozoa 3
Modeling Evaluations
Family Modeling
# =========================================================
#Family below
RANDOM_STATE = 42 # keep consistent

# Features & Target
trait_cols_fam = ["SS","A","G","O","T","V","C","F","K","M","S"] # exclude Superphylum_encoded
X_fam_all = family_ML[trait_cols_fam].copy()
y_fam_all = family_ML["Superphylum_encoded"].copy() # target is Superphylum

# Stratified Split
X_train_fam, X_test_fam, y_train_fam, y_test_fam = train_test_split(
    X_fam_all, y_fam_all, test_size=0.2, random_state=RANDOM_STATE, stratify=y_fam_all
)

# Class Weights (from TRAIN)
classes_fam = np.unique(y_train_fam)
weights_fam = compute_class_weight("balanced", classes=classes_fam, y=y_train_fam)
cw_fam = {c: w for c, w in zip(classes_fam, weights_fam)}

# Scale for LR only
scaler_fam = StandardScaler()
X_train_fam_scaled = scaler_fam.fit_transform(X_train_fam)
X_test_fam_scaled = scaler_fam.transform(X_test_fam)

# Models
dt_fam = DecisionTreeClassifier(
    random_state=RANDOM_STATE,
    class_weight=cw_fam,
    max_depth=8,
    min_samples_split=4,
    min_samples_leaf=2
)

rf_fam = RandomForestClassifier(
    random_state=RANDOM_STATE,
    class_weight="balanced_subsample",
    n_estimators=300,
    max_depth=12,
    min_samples_split=4,
    min_samples_leaf=2,
    n_jobs=1 # <-- disable parallelism
)

lr_fam = LogisticRegression(
    max_iter=2000,
    random_state=RANDOM_STATE,
    class_weight=cw_fam,
    multi_class="multinomial",
    C=0.5,
    solver="lbfgs",
    n_jobs=1 # <-- disable parallelism
)

# Train
dt_fam.fit(X_train_fam, y_train_fam)
rf_fam.fit(X_train_fam, y_train_fam)
lr_fam.fit(X_train_fam_scaled, y_train_fam)

# Predictions
dt_pred_fam = dt_fam.predict(X_test_fam)
rf_pred_fam = rf_fam.predict(X_test_fam)
lr_pred_fam = lr_fam.predict(X_test_fam_scaled)

# Evaluation
print("=== FAMILY (Superphylum) RESULTS ===")
labels_eval = np.unique(y_test_fam)

for name, pred in [
    ("Decision Tree", dt_pred_fam),
    ("Random Forest", rf_pred_fam),
    ("Logistic Regression", lr_pred_fam),
]:
    acc = accuracy_score(y_test_fam, pred)
    balc = balanced_accuracy_score(y_test_fam, pred)
    f1m = f1_score(y_test_fam, pred, average="macro", labels=labels_eval, zero_division=0)
    print(f"\n{name}: acc={acc:.3f} | bal_acc={balc:.3f} | macro-F1={f1m:.3f}")
    print(classification_report(y_test_fam, pred, labels=labels_eval, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y_test_fam, pred, labels=labels_eval))

# CV on random forest
min_count_fam = pd.Series(y_fam_all).value_counts().min()
if min_count_fam >= 3:
    n_splits_fam = int(min(5, min_count_fam))
    cv_folds_fam = StratifiedKFold(n_splits=n_splits_fam, shuffle=True, random_state=RANDOM_STATE)
    fam_cv_rf = cross_val_score(
        rf_fam, X_fam_all, y_fam_all, cv=cv_folds_fam, scoring="f1_macro", n_jobs=1 # <-- disable parallelism
    )
    print(f"\n[FAMILY Superphylum] RF {n_splits_fam}-fold CV macro-F1: {fam_cv_rf.mean():.3f} ± {fam_cv_rf.std():.3f}")
else:
    print("\n[FAMILY Superphylum] Skipping CV: smallest class < 3 samples.")
/Users/matthewthompson/micromamba/envs/data_science_foundation/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1272: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. From then on, it will always use 'multinomial'. Leave it to its default value to avoid this warning.
warnings.warn(
=== FAMILY (Superphylum) RESULTS ===
Decision Tree: acc=0.147 | bal_acc=0.280 | macro-F1=0.087
precision recall f1-score support
0 0.01 1.00 0.01 1
1 0.00 0.00 0.00 2
2 0.12 0.25 0.16 12
3 0.93 0.15 0.26 186
4 0.00 0.00 0.00 17
accuracy 0.15 218
macro avg 0.21 0.28 0.09 218
weighted avg 0.80 0.15 0.23 218
Confusion matrix:
[[ 1 0 0 0 0]
[ 2 0 0 0 0]
[ 8 0 3 1 0]
[136 0 22 28 0]
[ 16 0 0 1 0]]
Random Forest: acc=0.229 | bal_acc=0.283 | macro-F1=0.128
precision recall f1-score support
0 0.00 0.00 0.00 1
1 0.01 1.00 0.02 2
2 0.33 0.17 0.22 12
3 0.94 0.25 0.39 186
4 0.00 0.00 0.00 17
accuracy 0.23 218
macro avg 0.26 0.28 0.13 218
weighted avg 0.82 0.23 0.35 218
Confusion matrix:
[[ 0 1 0 0 0]
[ 0 2 0 0 0]
[ 0 8 2 2 0]
[ 0 136 4 46 0]
[ 0 16 0 1 0]]
Logistic Regression: acc=0.216 | bal_acc=0.296 | macro-F1=0.129
precision recall f1-score support
0 0.01 1.00 0.01 1
1 0.00 0.00 0.00 2
2 0.27 0.25 0.26 12
3 0.98 0.23 0.37 186
4 0.00 0.00 0.00 17
accuracy 0.22 218
macro avg 0.25 0.30 0.13 218
weighted avg 0.85 0.22 0.33 218
Confusion matrix:
[[ 1 0 0 0 0]
[ 2 0 0 0 0]
[ 8 0 3 1 0]
[136 0 7 43 0]
[ 16 0 1 0 0]]
[FAMILY Superphylum] RF 4-fold CV macro-F1: 0.109 ± 0.011
# =========================================================
# Summary of FAMILY (Superphylum) Results
print("=== FAMILY (Superphylum) SUMMARY ===")
print("\nDecision Tree")
print(" - Accuracy: 0.147")
print(" - Balanced Accuracy: 0.280")
print(" - Macro F1: 0.087")
print(" - Strong bias toward majority class, poor minority performance.")
print("\nRandom Forest")
print(" - Accuracy: 0.229")
print(" - Balanced Accuracy: 0.283")
print(" - Macro F1: 0.128")
print(" - Slightly better than Decision Tree, but still dominated by majority class.")
print("\nLogistic Regression")
print(" - Accuracy: 0.216")
print(" - Balanced Accuracy: 0.296")
print(" - Macro F1: 0.129")
print(" - Similar to Random Forest, recalls minority class a little better.")
print("\nCross-Validation (Random Forest)")
print(" - 4-fold CV Macro F1: 0.109 ± 0.011")
print(" - Confirms weak generalization across classes.")
=== FAMILY (Superphylum) SUMMARY ===
Decision Tree
- Accuracy: 0.147
- Balanced Accuracy: 0.280
- Macro F1: 0.087
- Strong bias toward majority class, poor minority performance.
Random Forest
- Accuracy: 0.229
- Balanced Accuracy: 0.283
- Macro F1: 0.128
- Slightly better than Decision Tree, but still dominated by majority class.
Logistic Regression
- Accuracy: 0.216
- Balanced Accuracy: 0.296
- Macro F1: 0.129
- Similar to Random Forest, recalls minority class a little better.
Cross-Validation (Random Forest)
- 4-fold CV Macro F1: 0.109 ± 0.011
- Confirms weak generalization across classes.
Evolution Modeling
# =========================================================
#Evolution below
= ["A","G","O","T","V","C","F","K","M","S"]
rate_cols
# build modeling matrix from evolution_ML
= evolution_ML[rate_cols].copy()
X_rates_all = evolution_ML[["Superphylum_encoded"]].copy()
X_super_all = evolution_ML["Superphylum_encoded"].copy() # <— target is Superphylum now
y_evo_all
# log1p rates
= np.log1p(X_rates_all)
X_log_all
# z-score rates
= StandardScaler()
scaler_evo = scaler_evo.fit_transform(X_log_all)
X_scaled_all
# stack with unscaled Superphylum code
= X_scaled_all
X_evo_all
# stratified split 2:1 (1/3 test)
= train_test_split(
X_train_evo, X_test_evo, y_train_evo, y_test_evo =1/3, random_state=RANDOM_STATE, stratify=y_evo_all
X_evo_all, y_evo_all, test_size
)
# models
= DecisionTreeClassifier(random_state=RANDOM_STATE, class_weight="balanced")
dt_evo = RandomForestClassifier(random_state=RANDOM_STATE, class_weight="balanced_subsample")
rf_evo = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE, class_weight="balanced", multi_class="ovr")
lr_evo
# train & predict
dt_evo.fit(X_train_evo, y_train_evo)
rf_evo.fit(X_train_evo, y_train_evo)# already scaled representation
lr_evo.fit(X_train_evo, y_train_evo)
= dt_evo.predict(X_test_evo)
dt_pred_evo = rf_evo.predict(X_test_evo)
rf_pred_evo = lr_evo.predict(X_test_evo)
lr_pred_evo
print("\n=== EVOLUTION RESULTS ===")
for name, pred in [("Decision Tree", dt_pred_evo), ("Random Forest", rf_pred_evo), ("Logistic Regression", lr_pred_evo)]:
= accuracy_score(y_test_evo, pred)
acc = balanced_accuracy_score(y_test_evo, pred)
bal = f1_score(y_test_evo, pred, average="macro")
f1m print(f"\n{name}: acc={acc:.3f} | bal_acc={bal:.3f} | macro-F1={f1m:.3f}")
print(classification_report(y_test_evo, pred, zero_division=0))
print("Confusion matrix:\n", confusion_matrix(y_test_evo, pred))
= pd.Series(y_evo_all).value_counts().min()
min_count_evo if min_count_evo >= 2:
= int(min(5, min_count_evo))
n_splits_evo = StratifiedKFold(n_splits=n_splits_evo, shuffle=True, random_state=RANDOM_STATE)
cv_folds_evo = cross_val_score(
evo_cv_rf =RANDOM_STATE, class_weight="balanced_subsample"),
RandomForestClassifier(random_state=cv_folds_evo, scoring="f1_macro"
X_evo_all, y_evo_all, cv
)print(f"\n[EVOLUTION] RF {n_splits_evo}-fold CV macro-F1: {evo_cv_rf.mean():.3f} ± {evo_cv_rf.std():.3f}")
else:
print("\n[EVOLUTION] Skipping CV: smallest class has < 2 samples, cannot stratify.")
=== EVOLUTION RESULTS ===
Decision Tree: acc=0.143 | bal_acc=0.229 | macro-F1=0.093
precision recall f1-score support
0 0.12 1.00 0.21 3
1 0.00 0.00 0.00 4
2 0.00 0.00 0.00 3
3 1.00 0.14 0.25 7
4 0.00 0.00 0.00 11
accuracy 0.14 28
macro avg 0.22 0.23 0.09 28
weighted avg 0.26 0.14 0.09 28
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 0 0 1]
[ 5 0 1 1 0]
[11 0 0 0 0]]
Random Forest: acc=0.214 | bal_acc=0.324 | macro-F1=0.232
precision recall f1-score support
0 0.12 1.00 0.21 3
1 0.00 0.00 0.00 4
2 1.00 0.33 0.50 3
3 1.00 0.29 0.44 7
4 0.00 0.00 0.00 11
accuracy 0.21 28
macro avg 0.42 0.32 0.23 28
weighted avg 0.37 0.21 0.19 28
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 1 0 0]
[ 5 0 0 2 0]
[11 0 0 0 0]]
Logistic Regression: acc=0.500 | bal_acc=0.324 | macro-F1=0.311
precision recall f1-score support
0 0.00 0.00 0.00 3
1 0.00 0.00 0.00 4
2 1.00 0.33 0.50 3
3 1.00 0.29 0.44 7
4 0.44 1.00 0.61 11
accuracy 0.50 28
macro avg 0.49 0.32 0.31 28
weighted avg 0.53 0.50 0.40 28
Confusion matrix:
[[ 0 0 0 0 3]
[ 0 0 0 0 4]
[ 0 0 1 0 2]
[ 0 0 0 2 5]
[ 0 0 0 0 11]]
/Users/matthewthompson/micromamba/envs/data_science_foundation/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1281: FutureWarning: 'multi_class' was deprecated in version 1.5 and will be removed in 1.7. Use OneVsRestClassifier(LogisticRegression(..)) instead. Leave it to its default value to avoid this warning.
warnings.warn(
[EVOLUTION] RF 5-fold CV macro-F1: 0.193 ± 0.065
print("""
=== EVOLUTION RESULTS ===
Decision Tree: acc=0.143 | bal_acc=0.229 | macro-F1=0.093
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 0 0 1]
[ 5 0 1 1 0]
[11 0 0 0 0]]
Random Forest: acc=0.214 | bal_acc=0.324 | macro-F1=0.232
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 1 0 0]
[ 5 0 0 2 0]
[11 0 0 0 0]]
Logistic Regression: acc=0.500 | bal_acc=0.324 | macro-F1=0.311
Confusion matrix:
[[ 0 0 0 0 3]
[ 0 0 0 0 4]
[ 0 0 1 0 2]
[ 0 0 0 2 5]
[ 0 0 0 0 11]]
[EVOLUTION] RF 5-fold CV macro-F1: 0.193 ± 0.065
""")
=== EVOLUTION RESULTS ===
Decision Tree: acc=0.143 | bal_acc=0.229 | macro-F1=0.093
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 0 0 1]
[ 5 0 1 1 0]
[11 0 0 0 0]]
Random Forest: acc=0.214 | bal_acc=0.324 | macro-F1=0.232
Confusion matrix:
[[ 3 0 0 0 0]
[ 4 0 0 0 0]
[ 2 0 1 0 0]
[ 5 0 0 2 0]
[11 0 0 0 0]]
Logistic Regression: acc=0.500 | bal_acc=0.324 | macro-F1=0.311
Confusion matrix:
[[ 0 0 0 0 3]
[ 0 0 0 0 4]
[ 0 0 1 0 2]
[ 0 0 0 2 5]
[ 0 0 0 0 11]]
[EVOLUTION] RF 5-fold CV macro-F1: 0.193 ± 0.065
# =========================================================
#side by side comparison
#helper to summarize metrics for one model on one dataset
def _summ(dataset, name, y_true, y_pred):
    return {
        "dataset": dataset,
        "model": name,
        "acc": accuracy_score(y_true, y_pred),
        "bal_acc": balanced_accuracy_score(y_true, y_pred),
        "macro_F1": f1_score(y_true, y_pred, average="macro"),
    }

#rows
rows = []
rows += [
    _summ("family", "Decision Tree", y_test_fam, dt_pred_fam),
    _summ("family", "Random Forest", y_test_fam, rf_pred_fam),
    _summ("family", "Logistic Regression", y_test_fam, lr_pred_fam),
]
rows += [
    _summ("evolution", "Decision Tree", y_test_evo, dt_pred_evo),
    _summ("evolution", "Random Forest", y_test_evo, rf_pred_evo),
    _summ("evolution", "Logistic Regression", y_test_evo, lr_pred_evo),
]

#long → wide (one row per model, side-by-side metrics)
comp_long = pd.DataFrame(rows)
comp_wide = (
    comp_long
    .pivot(index="model", columns="dataset", values=["acc", "bal_acc", "macro_F1"])
    .sort_index()
)
#printing
print("\n=== FAMILY vs EVOLUTION — MODEL COMPARISON ===")
print(comp_wide.round(3))
=== FAMILY vs EVOLUTION — MODEL COMPARISON ===
                           acc            bal_acc           macro_F1
dataset              evolution family   evolution family   evolution family
model
Decision Tree            0.143  0.147       0.229  0.280       0.093  0.087
Logistic Regression      0.500  0.216       0.324  0.296       0.311  0.129
Random Forest            0.214  0.229       0.324  0.283       0.232  0.128
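If the two-level column header is awkward to read, the (metric, dataset) pairs can be joined into flat labels; a small optional sketch reusing comp_wide from the cell above:

#flatten ("acc", "evolution") -> "acc_evolution" so the table has a single header row
comp_flat = comp_wide.copy()
comp_flat.columns = [f"{metric}_{dataset}" for metric, dataset in comp_flat.columns]
print(comp_flat.round(3))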
Logistic Regression stood out on the evolution dataset, reaching 50% accuracy and the highest macro-F1, while the Decision Tree and Random Forest performed poorly across the board. The caveat is that its balanced accuracy (0.324) is no better than the Random Forest's, because much of that accuracy comes from defaulting to the majority class (see its confusion matrix above). The family dataset was especially weak: every model struggled to separate classes, which suggests that simple binary presence/absence traits carry little predictive power. In contrast, the rate-based features in the evolution data appear to hold more useful signal, even if it is still limited.
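To quantify that caveat, 11 of the 28 evolution test samples fall in the majority class, so always predicting that class already yields about 0.393 accuracy. A quick majority-class baseline makes the comparison explicit; this is a sketch that assumes the train-split variables (X_train_fam, y_train_fam, X_train_evo, y_train_evo) exist from the earlier modeling cells, and DummyClassifier is not used elsewhere in this notebook.

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score

#assumed to exist from the earlier train/test split cells: X_train_*, y_train_*, X_test_*, y_test_*
splits = {
    "family":    (X_train_fam, y_train_fam, X_test_fam, y_test_fam),
    "evolution": (X_train_evo, y_train_evo, X_test_evo, y_test_evo),
}

for name, (X_tr, y_tr, X_te, y_te) in splits.items():
    dummy = DummyClassifier(strategy="most_frequent").fit(X_tr, y_tr)
    pred = dummy.predict(X_te)
    print(f"{name}: baseline acc={accuracy_score(y_te, pred):.3f} | "
          f"bal_acc={balanced_accuracy_score(y_te, pred):.3f} | "
          f"macro-F1={f1_score(y_te, pred, average='macro'):.3f}")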
# =========================================================
# SHAP interpretability
shap.initjs()

# -----------------------
# FAMILY
# -----------------------
explainer_fam = shap.TreeExplainer(rf_fam)
shap_values_fam = explainer_fam.shap_values(X_test_fam)

Xtf = X_test_fam.values
fn_fam = list(X_test_fam.columns)

#Convert shap_values into consistent 2D form (N samples × F features)
#If multi-class, aggregate absolute contributions across all classes
if isinstance(shap_values_fam, list):
    sv_fam = np.array(shap_values_fam)        # (C, N, F)
    sv_fam = np.transpose(sv_fam, (1, 2, 0))  # (N, F, C)
elif isinstance(shap_values_fam, np.ndarray) and shap_values_fam.ndim == 3:
    sv_fam = shap_values_fam                  # (N, F, C)
else:
    sv_fam = np.array(shap_values_fam)        # (N, F)

# Collapse to 2D by summing absolute contributions across classes
if sv_fam.ndim == 3:
    sv2d_fam = np.sum(np.abs(sv_fam), axis=2)  # (N, F)
else:
    sv2d_fam = sv_fam

#plots shap plot of family
print("\n[SHAP] Random Forest — FAMILY")
shap.summary_plot(sv2d_fam, Xtf, feature_names=fn_fam, show=True)

# quick text summary of the top contributors (mean |SHAP| per feature)
fam_importance = np.abs(sv2d_fam).mean(axis=0)
fam_top_idx = np.argsort(fam_importance)[::-1]
print("\n[SHAP] FAMILY — top features by average absolute contribution")
for rank in range(min(10, len(fn_fam))):
    j = fam_top_idx[rank]
    print(f"{rank+1:>2}. {fn_fam[j]} — mean|SHAP|={fam_importance[j]:.4f}")
# --------------------------
# EVOLUTION
# --------------------------
explainer_evo = shap.TreeExplainer(rf_evo)
shap_values_evo = explainer_evo.shap_values(X_test_evo)

# full names (used inside the model)
evo_feature_names = [f"log1p_{c}_z" for c in rate_cols]
Xte = np.asarray(X_test_evo)

# pretty names for display only
evo_feature_pretty = rate_cols  # ["A","G","O","T","V","C","F","K","M","S"]

# Convert SHAP values into 2D array
if isinstance(shap_values_evo, list):
    sv_evo = np.array(shap_values_evo)        # (C, N, F)
    sv_evo = np.transpose(sv_evo, (1, 2, 0))  # (N, F, C)
elif isinstance(shap_values_evo, np.ndarray) and shap_values_evo.ndim == 3:
    sv_evo = shap_values_evo                  # (N, F, C)
else:
    sv_evo = np.array(shap_values_evo)        # (N, F)

# collapse to 2D for global plot
if sv_evo.ndim == 3:
    sv2d_evo = np.sum(np.abs(sv_evo), axis=2)  # (N, F)
else:
    sv2d_evo = sv_evo

#SHAP summary plot with clean labels
print("\n[SHAP] Random Forest — EVOLUTION (global)")
shap.summary_plot(sv2d_evo, Xte, feature_names=evo_feature_pretty, show=True)

# quick text summary with clean labels
evo_importance = np.abs(sv2d_evo).mean(axis=0)
evo_top_idx = np.argsort(evo_importance)[::-1]
print("\n[SHAP] EVOLUTION — top features by average absolute contribution")
for rank in range(min(10, len(evo_feature_pretty))):
    j = evo_top_idx[rank]
    print(f"{rank+1:>2}. {evo_feature_pretty[j]} — mean|SHAP|={evo_importance[j]:.4f}")
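Because shap returns multi-class attributions either as a list of per-class arrays or as a single 3-D array depending on version, the collapse step above is easiest to see on a toy input. This sketch uses made-up numbers and is independent of the fitted models.

import numpy as np

#pretend SHAP output for 2 classes, 3 samples, 2 features: a list of (N, F) arrays
shap_list = [
    np.array([[ 0.2, -0.1], [ 0.0,  0.3], [-0.4,  0.1]]),   # class 0 contributions
    np.array([[-0.2,  0.1], [ 0.0, -0.3], [ 0.4, -0.1]]),   # class 1 contributions
]

sv = np.transpose(np.array(shap_list), (1, 2, 0))   # (C, N, F) -> (N, F, C)
sv2d = np.abs(sv).sum(axis=2)                       # collapse classes -> (N, F)

print(sv2d)               # per-sample, per-feature total |contribution|
print(sv2d.mean(axis=0))  # global mean |SHAP| per feature, the quantity ranked in the text summaries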
[SHAP] Random Forest — FAMILY
[SHAP] FAMILY — top features by average absolute contribution
1. SS — mean|SHAP|=0.1858
2. F — mean|SHAP|=0.1131
3. T — mean|SHAP|=0.0595
4. C — mean|SHAP|=0.0525
5. M — mean|SHAP|=0.0441
6. V — mean|SHAP|=0.0289
7. O — mean|SHAP|=0.0192
8. A — mean|SHAP|=0.0158
9. G — mean|SHAP|=0.0062
10. S — mean|SHAP|=0.0026
[SHAP] Random Forest — EVOLUTION (global)
[SHAP] EVOLUTION — top features by average absolute contribution
1. V — mean|SHAP|=0.0802
2. C — mean|SHAP|=0.0550
3. A — mean|SHAP|=0.0398
4. F — mean|SHAP|=0.0365
5. K — mean|SHAP|=0.0310
6. M — mean|SHAP|=0.0309
7. O — mean|SHAP|=0.0262
8. T — mean|SHAP|=0.0190
9. G — mean|SHAP|=0.0111
10. S — mean|SHAP|=0.0089
- For the family dataset, the single binary feature SS (sexual selection) dominates model predictions by a large margin, followed by traits like F (female choice) and T (territoriality). Most other traits have smaller and often negligible contributions. This suggests the model mostly keys off one or two strong binary indicators, which likely limits generalization.
- For the evolution dataset, importance is spread across several continuous rate features. V (visual traits) has the largest average impact, followed by C (competition), A (acoustic), and F (female choice). Unlike family, no single variable dominates completely — instead, multiple rates contribute modestly, making predictions more balanced.
Overall, the family model leans heavily on a few binary traits (especially SS), while the evolution model distributes influence across a wider set of rate-based features, indicating that continuous rates carry richer predictive signal even though the models' raw accuracy remains modest.
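As a cross-check on these rankings, the forests' built-in impurity-based importances can be placed next to the mean |SHAP| values. This sketch reuses rf_fam, fn_fam, fam_importance, rf_evo, evo_feature_pretty, and evo_importance from the cells above; importance_table is an illustrative helper, not part of the original analysis.

import pandas as pd

def importance_table(model, feature_names, shap_importance):
    #side-by-side view of impurity-based and SHAP-based feature rankings
    return (
        pd.DataFrame({
            "feature": feature_names,
            "impurity_importance": model.feature_importances_,
            "mean_abs_shap": shap_importance,
        })
        .sort_values("mean_abs_shap", ascending=False)
        .reset_index(drop=True)
    )

print(importance_table(rf_fam, fn_fam, fam_importance).round(4))
print(importance_table(rf_evo, evo_feature_pretty, evo_importance).round(4))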
Conclusion and Interpretations
Family_df (binary presence/absence): The EDA showed that the dataset is dominated by Arthropoda, with most phyla underrepresented and most traits very sparse. Only SS (any sexually selected trait) and F (female choice) appeared often enough to matter. The models reflect this: none performed well because there is little variation to learn from; the tree and forest overfit, and logistic regression also struggled. SHAP confirms it: SS drives most of the predictions, F and a few other traits add smaller contributions, and the rest are nearly irrelevant. The model clings to the only traits that vary enough, which limits generalization.
Evolution_df (rate-based traits): Here, the EDA showed a perfectly balanced set of phyla but very skewed and correlated trait distributions, with V (visual) and C (male–male competition) standing out. The models lined up with this: Logistic Regression reached about 50% accuracy and the highest macro-F1, making it the best of the three, while the tree-based models struggled with the skew and outliers. SHAP supports this picture: several features (V, C, A for acoustic, and F for female choice) contributed meaningfully, with no single one dominating. Predictions are therefore more balanced, and the continuous rate data carries a stronger, though still limited, signal than simple presence/absence.
Overall interpretation: The binary data in family_df are too imbalanced and sparse to support good predictions, so the models mostly latch onto SS and perform poorly. In contrast, evolution_df provides balanced phyla and richer rate-based features, allowing Logistic Regression to find some signal and SHAP to highlight a broader set of important traits.