@step(enable_cache=True)
def load_data() -> Annotated[DatasetBundle, "raw_dataset"]:
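    # Fetch the built-in breast cancer dataset and package it, together with
    # simple provenance info, as a DatasetBundle artifact.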
    data = load_breast_cancer()
    return DatasetBundle(
        data.data, data.target, data.feature_names,
        stats={"source": "sklearn.datasets.load_breast_cancer"},
    )

@step
def split_and_scale(
    bundle: DatasetBundle,
    test_size: float = 0.2,
    random_state: int = 42,
) -> Tuple[
    Annotated[np.ndarray, "X_train"],
    Annotated[np.ndarray, "X_test"],
    Annotated[np.ndarray, "y_train"],
    Annotated[np.ndarray, "y_test"],
]:
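    # Stratified split preserves class balance; the scaler is fit on the
    # training data only to avoid leaking test-set statistics.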
    X_tr, X_te, y_tr, y_te = train_test_split(
        bundle.X, bundle.y, test_size=test_size,
        random_state=random_state, stratify=bundle.y,
    )
    scaler = StandardScaler().fit(X_tr)
    X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)
    log_metadata(metadata={"train_size": len(X_tr), "test_size": len(X_te)})
    return X_tr, X_te, y_tr, y_te

@step
def train_candidate(
    X_train: np.ndarray,
    y_train: np.ndarray,
    model_type: str = "random_forest",
    n_estimators: int = 100,
    max_depth: int = 5,
) -> Annotated[Any, "candidate_model"]:
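    # Choose an estimator family from the model_type parameter; any
    # unrecognized value falls back to logistic regression.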
    if model_type == "random_forest":
        m = RandomForestClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth, random_state=42)
    elif model_type == "gradient_boosting":
        m = GradientBoostingClassifier(n_estimators=n_estimators,
                                       max_depth=max_depth, random_state=42)
    else:
        m = LogisticRegression(max_iter=2000, random_state=42)
    m.fit(X_train, y_train)
    log_metadata(metadata={
        "model_type": model_type,
        "hyperparameters": {"n_estimators": n_estimators, "max_depth": max_depth},
    })
    return m
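

# For context, a minimal sketch of how these steps could be wired into a
# ZenML pipeline and run. The pipeline name and the trailing run call are
# illustrative assumptions, not part of the snippet above.
from zenml import pipeline


@pipeline
def training_pipeline(model_type: str = "random_forest"):
    bundle = load_data()
    X_train, X_test, y_train, y_test = split_and_scale(bundle)
    # X_test / y_test and the returned model would typically flow into an
    # evaluation step; only the training path is shown here.
    model = train_candidate(X_train, y_train, model_type=model_type)


if __name__ == "__main__":
    training_pipeline()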
