Azure Machine Learning is Microsoft’s cloud ML platform. pip install azure-ai-ml azure-identity. ml_client = MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace). CLI v2: az ml job create -f job.yaml. Training: command(code="./src", command="python train.py ${{inputs.train_data}}", environment="azureml:sklearn-env:1", compute="cpu-cluster", inputs={"train_data": Input(type="uri_folder", path="azureml:churn-train:1")}). Environment: Environment(name="sklearn-env", conda_file="conda.yaml", image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04"). Compute: AmlCompute(name="cpu-cluster", size="STANDARD_DS3_V2", min_instances=0, max_instances=4). MLflow autolog: mlflow.sklearn.autolog() inside the training script logs params/metrics/model automatically. Model registration: ml_client.models.create_or_update(Model(path="./outputs/model", name="churn-model", type=AssetTypes.MLFLOW_MODEL)). Online Endpoint: ManagedOnlineEndpoint(name="churn-endpoint", auth_mode="key"), then ManagedOnlineDeployment(name="blue", endpoint_name="churn-endpoint", model="azureml:churn-model:1", instance_type="Standard_DS2_v2", instance_count=1). Traffic split: endpoint.traffic = {"blue": 80, "green": 20}. Batch endpoint: BatchEndpoint, BatchDeployment with min_instances/max_instances. ml_client.jobs.stream(job_name) tails logs. Pipeline job: pipeline_job = pipeline(train=train_step, evaluate=eval_step), ml_client.jobs.create_or_update(pipeline_job). Feature Store: fs_client = FeatureStoreClient(...), feature_set_spec.yaml with source and feature_transformation_code. Claude Code generates Azure ML job YAMLs, training scripts, endpoint configs, pipeline definitions, and TypeScript prediction clients.
CLAUDE.md for Azure ML
## Azure ML Stack
- SDK: azure-ai-ml >= 1.15 + azure-identity
- Client: MLClient(DefaultAzureCredential(), subscription_id, resource_group, workspace_name)
- Training: command(code, command, environment, compute, inputs) → ml_client.jobs.create_or_update()
- Environment: Environment(name, conda_file, image) → ml_client.environments.create_or_update()
- Compute: AmlCompute(name, size, min_instances=0, max_instances) → ml_client.compute.begin_create_or_update()
- Endpoint: ManagedOnlineEndpoint + ManagedOnlineDeployment → traffic split dict
- Model: ml_client.models.create_or_update(Model(path, name, type=AssetTypes.MLFLOW_MODEL))
- CLI: az ml job create -f job.yaml / az ml online-endpoint invoke --name ...
Training Script with MLflow
# src/train.py — Azure ML training script with MLflow autolog
from __future__ import annotations
import argparse
import json
import os
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments for the churn training script."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--train_data", type=str, required=True)
    cli.add_argument("--test_data", type=str, default="")
    cli.add_argument("--n_estimators", type=int, default=200)
    cli.add_argument("--learning_rate", type=float, default=0.05)
    cli.add_argument("--max_depth", type=int, default=4)
    cli.add_argument("--registered_model_name", type=str, default="churn-model")
    return cli.parse_args()
def main() -> None:
    """Train and register a churn classifier via MLflow inside an Azure ML job.

    Reads ``train.csv`` from --train_data (and ``test.csv`` from --test_data
    when provided), fits a StandardScaler + GradientBoostingClassifier
    pipeline, logs AUC and hyperparameters, and registers the fitted model
    under --registered_model_name.
    """
    args = parse_args()
    # Azure ML + MLflow integration — auto-connects when running in an Azure ML job.
    mlflow.sklearn.autolog()
    feature_cols = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
    target_col = "churned"
    train_df = pd.read_csv(os.path.join(args.train_data, "train.csv"))
    if args.test_data:
        test_df = pd.read_csv(os.path.join(args.test_data, "test.csv"))
    else:
        # BUG FIX: the previous fallback sampled the eval set from train_df but
        # still trained on *all* of train_df, so metrics were computed on rows
        # seen during fit (data leakage). Hold the sampled rows out of training.
        test_df = train_df.sample(frac=0.2, random_state=42)
        train_df = train_df.drop(test_df.index)
    X_train = train_df[feature_cols].values
    y_train = train_df[target_col].values
    X_test = test_df[feature_cols].values
    y_test = test_df[target_col].values
    with mlflow.start_run():
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", GradientBoostingClassifier(
                n_estimators=args.n_estimators,
                learning_rate=args.learning_rate,
                max_depth=args.max_depth,
                random_state=42,
            )),
        ])
        pipeline.fit(X_train, y_train)
        y_proba = pipeline.predict_proba(X_test)[:, 1]
        y_pred = pipeline.predict(X_test)
        auc = roc_auc_score(y_test, y_proba)
        mlflow.log_metric("auc", auc)
        mlflow.log_params({
            "n_estimators": args.n_estimators,
            "learning_rate": args.learning_rate,
            "max_depth": args.max_depth,
        })
        print(f"\nAUC-ROC: {auc:.4f}")
        print(classification_report(y_test, y_pred))
        # Register the model in the Azure ML Model Registry via MLflow.
        mlflow.sklearn.log_model(
            sk_model=pipeline,
            artifact_path="model",
            registered_model_name=args.registered_model_name,
        )


if __name__ == "__main__":
    main()
Azure ML SDK v2 Workflow
# ml/azure_ml_workflow.py — training, registration, and endpoint deployment
from __future__ import annotations
import os
from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.entities import (
AmlCompute,
Environment,
ManagedOnlineEndpoint,
ManagedOnlineDeployment,
Model,
BuildContext,
)
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
# Azure ML workspace coordinates, resolved from the environment at import time.
# SUBSCRIPTION defaults to "" so a missing AZURE_SUBSCRIPTION_ID fails loudly
# at client-call time rather than silently targeting the wrong subscription.
SUBSCRIPTION = os.environ.get("AZURE_SUBSCRIPTION_ID", "")
RESOURCE_GROUP = os.environ.get("AZURE_RESOURCE_GROUP", "ml-rg")
WORKSPACE = os.environ.get("AZURE_ML_WORKSPACE", "ml-workspace")
def get_client() -> MLClient:
    """Build an MLClient for the workspace configured via environment variables."""
    credential = DefaultAzureCredential()
    return MLClient(
        credential=credential,
        subscription_id=SUBSCRIPTION,
        resource_group_name=RESOURCE_GROUP,
        workspace_name=WORKSPACE,
    )
# ── Compute ──────────────────────────────────────────────────────────────────
def ensure_compute(ml_client: MLClient, name: str = "cpu-cluster") -> None:
    """Create compute cluster if it doesn't exist."""
    try:
        ml_client.compute.get(name)
    except Exception:
        # Lookup failed — assume the cluster is missing and provision an
        # autoscaling CPU cluster (scales to zero when idle).
        spec = AmlCompute(
            name=name,
            type="amlcompute",
            size="Standard_DS3_v2",
            min_instances=0,
            max_instances=4,
            idle_time_before_scale_down=120,
            tier="Dedicated",
        )
        ml_client.compute.begin_create_or_update(spec).result()
        print(f"Created compute cluster: {name}")
    else:
        print(f"Compute cluster '{name}' already exists")
# ── Environment ──────────────────────────────────────────────────────────────
def ensure_environment(ml_client: MLClient) -> str:
    """Register the training environment and return its azureml: reference."""
    pip_packages = [
        "scikit-learn>=1.2",
        "pandas>=2.0",
        "mlflow>=2.10",
        "azure-ai-ml>=1.15",
    ]
    # Inline conda spec — avoids shipping a separate conda.yaml with the code.
    conda_spec = {
        "name": "sklearn-env",
        "channels": ["defaults", "conda-forge"],
        "dependencies": ["python=3.11", "pip", {"pip": pip_packages}],
    }
    registered = ml_client.environments.create_or_update(
        Environment(
            name="sklearn-mlflow-env",
            version="1",
            description="sklearn + MLflow for churn model training",
            conda_file=conda_spec,
            image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest",
        )
    )
    return f"azureml:{registered.name}:{registered.version}"
# ── Training job ─────────────────────────────────────────────────────────────
def submit_training_job(
    ml_client: MLClient,
    environment_id: str,
    compute_name: str = "cpu-cluster",
) -> str:
    """Submit a training command job, stream its logs, and return the run name."""
    # ${{inputs.*}} placeholders are resolved by Azure ML at job runtime.
    train_cmd = " ".join([
        "python train.py",
        "--train_data ${{inputs.train_data}}",
        "--test_data ${{inputs.test_data}}",
        "--n_estimators 200 --learning_rate 0.05 --max_depth 4",
    ])
    job_inputs = {
        "train_data": Input(type="uri_folder", path="azureml:churn-train:1"),
        "test_data": Input(type="uri_folder", path="azureml:churn-test:1"),
    }
    job = command(
        code="./src",
        command=train_cmd,
        environment=environment_id,
        compute=compute_name,
        display_name="churn-model-training",
        experiment_name="churn-classification",
        inputs=job_inputs,
    )
    submitted = ml_client.jobs.create_or_update(job)
    ml_client.jobs.stream(submitted.name)  # Blocks until the job finishes, tailing logs
    print(f"Job completed: {submitted.name}")
    return submitted.name
# ── Online Endpoint ───────────────────────────────────────────────────────────
def deploy_online_endpoint(
    ml_client: MLClient,
    model_name: str = "churn-model",
    model_version: str = "1",
) -> str:
    """Create endpoint and deploy model with blue/green split."""
    endpoint_name = "churn-endpoint"
    ml_client.online_endpoints.begin_create_or_update(
        ManagedOnlineEndpoint(
            name=endpoint_name,
            description="Churn prediction endpoint",
            auth_mode="key",
            tags={"env": "production"},
        )
    ).result()
    blue = ManagedOnlineDeployment(
        name="blue",
        endpoint_name=endpoint_name,
        model=f"azureml:{model_name}:{model_version}",
        instance_type="Standard_DS2_v2",
        instance_count=1,
        liveness_probe={"initial_delay": 10, "period": 10, "timeout": 2},
    )
    ml_client.online_deployments.begin_create_or_update(blue).result()
    # Re-fetch the endpoint, then route 100% of traffic to the blue deployment.
    live_endpoint = ml_client.online_endpoints.get(endpoint_name)
    live_endpoint.traffic = {"blue": 100}
    ml_client.online_endpoints.begin_create_or_update(live_endpoint).result()
    keys = ml_client.online_endpoints.get_keys(endpoint_name)
    print(f"Endpoint deployed. API key: {keys.primary_key[:8]}...")
    return endpoint_name
CLI v2 YAML Job
# jobs/train_job.yaml — Azure ML CLI v2 job definition
# Submit with: az ml job create -f jobs/train_job.yaml
$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
type: command
display_name: churn-model-training
experiment_name: churn-classification
description: Train churn classifier with MLflow tracking
# Local folder uploaded as the job snapshot; train.py lives here.
code: ../src
# ${{inputs.*}} placeholders are resolved by Azure ML at runtime.
command: >-
  python train.py
  --train_data ${{inputs.train_data}}
  --n_estimators 200
  --learning_rate 0.05
  --max_depth 4
environment: azureml:sklearn-mlflow-env:1
compute: azureml:cpu-cluster
inputs:
  train_data:
    type: uri_folder           # mounted as a folder path inside the container
    path: azureml:churn-train:1
resources:
  instance_count: 1
TypeScript Client
// lib/azure-ml/client.ts — invoke Azure ML Online Endpoint
// Endpoint scoring URI and key, injected via environment — never hard-code the key.
const ENDPOINT_URL = process.env.AZURE_ML_ENDPOINT_URL ?? ""
const API_KEY = process.env.AZURE_ML_API_KEY ?? ""

// One feature row; field order must match the `columns` array sent to the endpoint.
export type ChurnInput = {
  age: number
  tenure_days: number
  monthly_spend: number
  support_tickets: number
  last_login_days: number
}

// Post-processed prediction returned to callers.
export type ChurnPrediction = {
  churn_probability: number // rounded to 4 decimal places
  risk_tier: string // "HIGH" | "MEDIUM" | "LOW"
}
/**
 * Invoke the Azure ML online endpoint with a batch of feature records.
 *
 * Sends the MLflow "split" payload shape ({ input_data: { columns, data } })
 * and maps the returned scores to tiered predictions.
 *
 * @param records feature rows to score (may be empty)
 * @returns one ChurnPrediction per input record, in order
 * @throws Error when the endpoint responds with a non-2xx status
 */
export async function predictChurn(records: ChurnInput[]): Promise<ChurnPrediction[]> {
  // Robustness fix: skip the network round-trip entirely for an empty batch.
  if (records.length === 0) return []
  const res = await fetch(ENDPOINT_URL, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      // Managed online endpoints accept the endpoint key as a Bearer token.
      "Authorization": `Bearer ${API_KEY}`,
    },
    body: JSON.stringify({
      input_data: {
        columns: ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"],
        data: records.map(r => [r.age, r.tenure_days, r.monthly_spend, r.support_tickets, r.last_login_days]),
      },
    }),
  })
  if (!res.ok) throw new Error(`Azure ML ${res.status}: ${await res.text()}`)
  // NOTE(review): assumes the deployment returns a bare array of probabilities;
  // an MLflow sklearn model served as-is returns predict() labels — confirm the
  // scoring script exposes predict_proba values.
  const predictions: number[] = await res.json()
  return predictions.map(prob => ({
    churn_probability: Math.round(prob * 10000) / 10000,
    risk_tier: prob > 0.7 ? "HIGH" : prob > 0.3 ? "MEDIUM" : "LOW",
  }))
}
Consider the Vertex AI alternative when your data and infrastructure live on Google Cloud with BigQuery as the data warehouse — Vertex AI’s AutoML and BigQuery ML integration make it the natural fit there, while Azure ML is the right choice for Microsoft-ecosystem teams with Azure Data Factory pipelines, Azure Blob Storage data lakes, Active Directory authentication, and existing Azure Databricks or Synapse Analytics deployments. Consider the SageMaker alternative when your workloads are AWS-native with S3 data lakes and existing IAM roles — SageMaker’s deep AWS service integration contrasts with Azure ML’s enterprise Active Directory SSO and Azure DevOps CI/CD integration, which make Azure ML natural for organizations already invested in the Microsoft cloud stack. The Claude Skills 360 bundle includes Azure ML skill sets covering command jobs, environment management, online endpoint deployment, pipeline YAML specs, and TypeScript prediction clients. Start with the free tier to try Azure ML workflow generation.