Skip to content

Tuning LightGBM With GASearchCV

LightGBM grows trees leaf-wise rather than level-wise, which makes num_leaves its most important — and most dangerous — parameter. A large num_leaves with shallow max_depth is wasteful; a large num_leaves with deep trees overfits fast. This tutorial searches the joint space with GASearchCV, shows the real gain over LightGBM's defaults, and visualizes the num_leaves / max_depth interaction the search learns to respect.

Prerequisites

bash
# pandas and matplotlib are needed for the result tables and plots below.
pip install sklearn-genetic-opt lightgbm pandas matplotlib

A Noisy Dataset

Defaults overfit when the signal is weak and the data is noisy, so we build exactly that: 30 features, 8 informative, label noise, overlapping clusters.

python
import warnings
import time

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split

from sklearn_genetic import (
    EvolutionConfig,
    GASearchCV,
    OptimizationConfig,
    PopulationConfig,
    RuntimeConfig,
)
from sklearn_genetic.callbacks import ConsecutiveStopping, TimerStopping
from sklearn_genetic.schedules import ExponentialAdapter, InverseAdapter
from sklearn_genetic.space import Continuous, Integer

# Silence every warning so library chatter does not drown the tutorial output.
warnings.filterwarnings("ignore")
RANDOM_STATE = 42

# Synthetic classification problem built to be hard: 30 features of which
# only 8 are informative and 8 are redundant, three clusters per class,
# low class separation, and flip_y label noise.
X, y = make_classification(
    n_samples=2500,
    n_features=30,
    n_informative=8,
    n_redundant=8,
    n_clusters_per_class=3,
    class_sep=0.6,
    flip_y=0.08,
    random_state=RANDOM_STATE,
)

# Stratified hold-out split: 40% of the rows are reserved for final testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.40,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Shuffled, stratified 3-fold splitter with a fixed seed.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)
print(f"train={X_train.shape}  test={X_test.shape}")
text
train=(1500, 30)  test=(1000, 30)

Baseline: LightGBM Defaults

LightGBM manages its own threads, so we set n_jobs=1 on the estimator and verbose=-1 to silence per-iteration logging.

python
def evaluate(name, estimator):
    """Score a fitted classifier on the module-level test split.

    Args:
        name: Label stored under the ``"model"`` key of the result.
        estimator: Fitted object exposing ``predict_proba`` and ``predict``.

    Returns:
        A dict with the model label plus accuracy, balanced accuracy and
        ROC AUC on ``X_test`` / ``y_test``, each rounded to 4 decimals.
    """
    # Column 1 of predict_proba is used as the ranking score for ROC AUC.
    positive_scores = estimator.predict_proba(X_test)[:, 1]
    hard_labels = estimator.predict(X_test)

    raw_scores = {
        "accuracy": accuracy_score(y_test, hard_labels),
        "balanced_accuracy": balanced_accuracy_score(y_test, hard_labels),
        "roc_auc": roc_auc_score(y_test, positive_scores),
    }

    report = {"model": name}
    for metric, value in raw_scores.items():
        report[metric] = round(value, 4)
    return report


# Reference model: LightGBM with no tuned hyperparameters, single-threaded
# and with per-iteration logging switched off.
baseline = LGBMClassifier(random_state=RANDOM_STATE, n_jobs=1, verbose=-1).fit(
    X_train, y_train
)
baseline_metrics = evaluate("LightGBM defaults", baseline)
print(baseline_metrics)
text
{'model': 'LightGBM defaults', 'accuracy': 0.788, 'balanced_accuracy': 0.788, 'roc_auc': 0.8665}

Search Space

The key relationship is num_leaves ≤ 2^max_depth: a tree limited to depth max_depth can hold at most 2^max_depth leaves. We give the search wide ranges for both and let it discover the productive combinations; the scatter plot later shows where they are.

python
# Search space for the genetic search. Key order is kept as-is on purpose:
# it may determine how candidates are encoded.
param_grid = {
    # Ensemble size.
    "n_estimators": Integer(50, 350),
    # Tree shape: the two knobs whose interaction this tutorial studies.
    "num_leaves": Integer(8, 255),
    "max_depth": Integer(3, 14),
    # Step size, sampled on a log scale.
    "learning_rate": Continuous(0.01, 0.3, distribution="log-uniform"),
    "min_child_samples": Integer(5, 100),
    # NOTE(review): LightGBM documents that subsample is only applied when
    # subsample_freq > 0 (its default is 0) — confirm bagging is actually
    # enabled on the estimator, otherwise this dimension is inert.
    "subsample": Continuous(0.5, 1.0),
    "colsample_bytree": Continuous(0.4, 1.0),
    # L1 / L2 regularisation strengths, sampled on a log scale.
    "reg_alpha": Continuous(1e-5, 10.0, distribution="log-uniform"),
    "reg_lambda": Continuous(1e-5, 10.0, distribution="log-uniform"),
}

CPU oversubscription

Pair n_jobs=1 on the LGBMClassifier with parallel_backend="cv" in RuntimeConfig so the search parallelizes at the fold level rather than multiplying LightGBM's internal threads by the number of candidate workers.

Configure and Run

python
# Configure the genetic search over param_grid, run it with stopping
# callbacks while timing the fit, then report the best CV score and parameters.
ga_search = GASearchCV(
    # n_jobs=1 keeps LightGBM itself single-threaded; parallelism is requested
    # through runtime_config below instead.
    estimator=LGBMClassifier(random_state=RANDOM_STATE, n_jobs=1, verbose=-1),
    random_state=RANDOM_STATE,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=cv,
    evolution_config=EvolutionConfig(
        population_size=10,
        generations=8,
        # Both probabilities are scheduled from initial_value towards end_value
        # (presumably decaying per generation — see sklearn_genetic.schedules).
        crossover_probability=ExponentialAdapter(initial_value=0.8, end_value=0.4, adaptive_rate=0.15),
        mutation_probability=InverseAdapter(initial_value=0.25, end_value=0.05, adaptive_rate=0.20),
        elitism=True,
        keep_top_k=3,
    ),
    population_config=PopulationConfig(
        initializer="smart",
        # One hand-picked starting configuration; every value lies inside the
        # bounds declared in param_grid.
        # NOTE(review): looks like LightGBM's defaults clipped into the search
        # bounds — confirm that is the intent.
        warm_start_configs=[{
            "n_estimators": 100, "num_leaves": 31, "max_depth": 7,
            "learning_rate": 0.1, "min_child_samples": 20, "subsample": 1.0,
            "colsample_bytree": 1.0, "reg_alpha": 1e-5, "reg_lambda": 1e-5,
        }],
    ),
    # All cores, with the "cv" parallel backend selected.
    runtime_config=RuntimeConfig(n_jobs=-1, parallel_backend="cv",
                                 use_cache=True, verbose=False),
    optimization_config=OptimizationConfig(
        diversity_control=True, fitness_sharing=True,
        local_search=True, local_search_top_k=2,
    ),
)

# Two stopping callbacks: one keyed on the fitness_best metric over 6
# generations, one on a 90-second budget (the exact stopping rules are
# defined by sklearn_genetic.callbacks).
callbacks = [
    ConsecutiveStopping(generations=6, metric="fitness_best"),
    TimerStopping(total_seconds=90),
]
# Wall-clock timing of the whole search via perf_counter.
started = time.perf_counter()
ga_search.fit(X_train, y_train, callbacks=callbacks)
ga_seconds = time.perf_counter() - started

print(f"Best CV ROC AUC : {ga_search.best_score_:.4f}   (search took {ga_seconds:.0f}s)")
print("Best parameters :")
for key, value in ga_search.best_params_.items():
    print(f"  {key}: {value}")
text
Best CV ROC AUC : 0.8508   (search took 50s)
Best parameters :
  n_estimators: 171
  num_leaves: 179
  max_depth: 11
  learning_rate: 0.019653590081344673
  min_child_samples: 9
  subsample: 0.559939750268555
  colsample_bytree: 0.7222065919875744
  reg_alpha: 1.1130156968433846e-05
  reg_lambda: 0.18328989877515126

Baseline vs Tuned

python
# Score the tuned search object on the same held-out split as the baseline
# and show both rows side by side.
ga_metrics = evaluate("GASearchCV (tuned)", ga_search)
comparison = pd.DataFrame([baseline_metrics, ga_metrics])
print(comparison.to_string(index=False))
print()

# Signed test-set ROC AUC difference, tuned minus defaults.
roc_auc_gain = ga_metrics['roc_auc'] - baseline_metrics['roc_auc']
print(f"ROC AUC improvement over defaults: "
      f"{roc_auc_gain:+.4f}")
text
             model  accuracy  balanced_accuracy  roc_auc
 LightGBM defaults     0.788              0.788   0.8665
GASearchCV (tuned)     0.803              0.803   0.8863

ROC AUC improvement over defaults: +0.0198

Fitness over generations

python
import matplotlib.pyplot as plt

# Plot two columns of the search history against the generation index.
history = pd.DataFrame(ga_search.history)
generations = history["gen"]

fig, ax = plt.subplots(figsize=(9, 4))
ax.plot(
    generations,
    history["fitness_best"],
    marker="o",
    label="best so far",
    color="#16a085",
)
ax.plot(
    generations,
    history["fitness"],
    marker=".",
    label="generation mean",
    color="#95a5a6",
)
ax.set(
    xlabel="Generation",
    ylabel="CV ROC AUC",
    title="LightGBM genetic search — fitness over generations",
)
ax.legend(frameon=False)
ax.grid(alpha=0.25)
fig.tight_layout()

Best and mean cross-validated ROC AUC over generations

The num_leaves / max_depth interaction

Each evaluated candidate is plotted by its num_leaves and max_depth and colored by CV score. The productive region respects num_leaves ≤ 2^max_depth — the search concentrates there instead of wasting effort on combinations whose extra leaves can never be used or that overfit.

python
# One point per row of cv_results_: x = max_depth, y = num_leaves,
# color = mean CV score.
results = pd.DataFrame(ga_search.cv_results_)
fig, ax = plt.subplots(figsize=(8, 5))
sc = ax.scatter(
    results["param_max_depth"],
    results["param_num_leaves"],
    c=results["mean_test_score"],
    cmap="viridis",
    s=60,
    edgecolor="white",
)

# Reference curve num_leaves = 2^max_depth over the searched depth range;
# the y-limit below clips it to the searched num_leaves range.
depths = np.arange(3, 15)
ax.plot(
    depths,
    np.power(2.0, depths),
    "--",
    color="crimson",
    label="num_leaves = 2^max_depth",
)
ax.set(
    ylim=(0, 270),
    xlabel="max_depth",
    ylabel="num_leaves",
    title="Evaluated candidates, colored by CV ROC AUC",
)
ax.legend(frameon=False)
fig.colorbar(sc, label="mean CV ROC AUC")
fig.tight_layout()

Scatter of candidates over max_depth and num_leaves with the 2^max_depth boundary

Practical Notes

  • num_leaves is LightGBM's primary complexity knob — always tune it together with max_depth, never in isolation.
  • Pair n_jobs=1 on the estimator with parallel_backend="cv" to avoid CPU oversubscription.
  • The headline win is tuning vs the default model; the search finds a configuration that generalizes better on noisy data.
  • verbose=-1 keeps LightGBM quiet so the generation log stays readable.

See Also

Released under the MIT License.