Vertex AI is Google Cloud’s ML platform — unified training, tuning, and serving. pip install google-cloud-aiplatform. aiplatform.init(project="my-project", location="us-central1"). Custom training: job = aiplatform.CustomTrainingJob(display_name="churn-train", script_path="train.py", container_uri="us-docker.pkg.dev/vertex-ai/training/sklearn-cpu.1-2:latest"). model = job.run(replica_count=1, machine_type="n1-standard-4", args=["--epochs=10"]). Pre-built containers for sklearn, XGBoost, TensorFlow, PyTorch, and HuggingFace. model.upload for pre-trained models: aiplatform.Model.upload(display_name="churn-v1", artifact_uri="gs://bucket/model/", serving_container_image_uri="..."). Endpoints: endpoint = aiplatform.Endpoint.create(display_name="churn-endpoint"), model.deploy(endpoint=endpoint, machine_type="n1-standard-2", min_replica_count=1, max_replica_count=5). endpoint.predict(instances=[[1.0, 2.0, 3.0]]). Batch Prediction: model.batch_predict(job_display_name="batch-score", gcs_source="gs://bucket/input/*.jsonl", gcs_destination_prefix="gs://bucket/output/", machine_type="n1-standard-4"). Vertex Pipelines: @component and @pipeline decorators from kfp.dsl, compiler.Compiler().compile(pipeline_func, "pipeline.yaml"), aiplatform.PipelineJob(template_path="pipeline.yaml").submit(). Feature Store: FeatureStore.create(...), EntityType.create(...), Feature.create(...), entity_type.ingest_from_df(df, feature_time="timestamp_col"), entity_type.read(entity_ids=["user_1"]). Model Garden: aiplatform.PublisherModel(model_name="publishers/google/models/gemini-1.5-flash-001"). Claude Code generates Vertex AI training jobs, pipeline components, endpoint deployments, feature store setups, and TypeScript prediction clients.
CLAUDE.md for Vertex AI
## Vertex AI Stack
- SDK: google-cloud-aiplatform >= 1.50
- Init: aiplatform.init(project=GCP_PROJECT, location="us-central1")
- Training: CustomTrainingJob(script_path, container_uri).run(machine_type, args)
- Upload: aiplatform.Model.upload(artifact_uri="gs://", serving_container_image_uri)
- Endpoint: model.deploy(endpoint, machine_type, min_replica_count, max_replica_count)
- Batch: model.batch_predict(gcs_source, gcs_destination_prefix, machine_type)
- Pipelines: @kfp.dsl.component + @kfp.dsl.pipeline → PipelineJob.submit()
- Features: FeatureStore → EntityType → Feature → ingest_from_df / read
Training Script (GCS artifacts)
# train.py — Vertex AI custom training entry point
from __future__ import annotations
import argparse
import json
import os
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from google.cloud import storage
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser()
parser.add_argument("--n-estimators", type=int, default=200)
parser.add_argument("--learning-rate", type=float, default=0.05)
parser.add_argument("--max-depth", type=int, default=4)
parser.add_argument("--train-data", type=str, required=True) # gs:// path
parser.add_argument("--test-data", type=str, default="")
# AIP_MODEL_DIR set by Vertex AI for artifact output
parser.add_argument("--model-dir", type=str, default=os.environ.get("AIP_MODEL_DIR", "/tmp/model"))
return parser.parse_args()
def read_gcs_csv(gcs_path: str) -> pd.DataFrame:
"""Read CSV from GCS, returning a DataFrame."""
return pd.read_csv(gcs_path)
def upload_to_gcs(local_path: str, gcs_uri: str) -> None:
"""Upload a local file to GCS."""
gcs_uri = gcs_uri.replace("gs://", "")
bucket_name, *blob_parts = gcs_uri.split("/")
blob_name = "/".join(blob_parts)
storage.Client().bucket(bucket_name).blob(blob_name).upload_from_filename(local_path)
def main():
args = parse_args()
feature_cols = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
target_col = "churned"
train_df = read_gcs_csv(args.train_data)
test_df = read_gcs_csv(args.test_data) if args.test_data else train_df.sample(frac=0.2)
X_train, y_train = train_df[feature_cols].values, train_df[target_col].values
X_test, y_test = test_df[feature_cols].values, test_df[target_col].values
pipeline = Pipeline([
("scaler", StandardScaler()),
("clf", GradientBoostingClassifier(
n_estimators=args.n_estimators,
learning_rate=args.learning_rate,
max_depth=args.max_depth,
random_state=42,
)),
])
pipeline.fit(X_train, y_train)
auc = roc_auc_score(y_test, pipeline.predict_proba(X_test)[:, 1])
print(f"AUC-ROC: {auc:.4f}")
# Vertex AI expects model artifacts in AIP_MODEL_DIR
os.makedirs(args.model_dir, exist_ok=True)
model_path = os.path.join(args.model_dir, "model.pkl")
with open(model_path, "wb") as f:
pickle.dump(pipeline, f)
# Write metadata for Model Evaluation
metadata = {"auc": auc, "framework": "sklearn", "features": feature_cols}
with open(os.path.join(args.model_dir, "metadata.json"), "w") as f:
json.dump(metadata, f)
print(f"Model saved to {args.model_dir}")
if __name__ == "__main__":
main()
Vertex AI SDK Workflow
# ml/vertex_workflow.py — training, deployment, and batch prediction
from __future__ import annotations
from google.cloud import aiplatform
from kfp import dsl, compiler
from kfp.dsl import component, Output, Model, Metrics
PROJECT = "my-gcp-project"
LOCATION = "us-central1"
BUCKET = "gs://my-ml-bucket"
REPO = f"us-central1-docker.pkg.dev/{PROJECT}/ml-images"
def init_vertex():
aiplatform.init(project=PROJECT, location=LOCATION, staging_bucket=BUCKET)
# ── Training ────────────────────────────────────────────────────────────────
def run_training_job(
train_data: str = f"{BUCKET}/data/train.csv",
test_data: str = f"{BUCKET}/data/test.csv",
) -> aiplatform.Model:
init_vertex()
job = aiplatform.CustomTrainingJob(
display_name="churn-model-training",
script_path="train.py",
container_uri="us-docker.pkg.dev/vertex-ai/training/sklearn-cpu.1-2:latest",
requirements=["google-cloud-storage>=2.0"],
model_serving_container_image_uri="us-docker.pkg.dev/vertex-ai/prediction/sklearn-cpu.1-2:latest",
)
model = job.run(
model_display_name="churn-classifier",
replica_count=1,
machine_type="n1-standard-4",
args=[
f"--train-data={train_data}",
f"--test-data={test_data}",
"--n-estimators=200",
"--learning-rate=0.05",
],
base_output_dir=f"{BUCKET}/training-output",
sync=True,
)
print(f"Model trained: {model.resource_name}")
return model
# ── Endpoint deployment ──────────────────────────────────────────────────────
def deploy_endpoint(
model: aiplatform.Model,
endpoint_name: str = "churn-prod",
machine_type: str = "n1-standard-2",
) -> aiplatform.Endpoint:
init_vertex()
endpoint = aiplatform.Endpoint.create(
display_name=endpoint_name,
labels={"env": "production", "team": "ml"},
)
model.deploy(
endpoint=endpoint,
deployed_model_display_name=f"{endpoint_name}-v1",
machine_type=machine_type,
min_replica_count=1,
max_replica_count=5,
traffic_split={"0": 100},
accelerator_type=None,
explanation_metadata=None,
explanation_parameters=None,
)
print(f"Deployed to endpoint: {endpoint.resource_name}")
return endpoint
# ── Batch prediction ─────────────────────────────────────────────────────────
def run_batch_prediction(
model: aiplatform.Model,
gcs_src: str = f"{BUCKET}/batch-input/*.jsonl",
gcs_dest: str = f"{BUCKET}/batch-output/",
) -> aiplatform.BatchPredictionJob:
init_vertex()
job = model.batch_predict(
job_display_name="churn-batch-score",
gcs_source=gcs_src,
gcs_destination_prefix=gcs_dest,
instances_format="jsonl",
predictions_format="jsonl",
machine_type="n1-standard-4",
starting_replica_count=2,
max_replica_count=10,
sync=True,
)
print(f"Batch job complete: {job.resource_name}")
return job
# ── Vertex Pipelines ─────────────────────────────────────────────────────────
@component(
base_image="us-docker.pkg.dev/vertex-ai/training/sklearn-cpu.1-2:latest",
packages_to_install=["google-cloud-aiplatform>=1.50"],
)
def train_component(
project: str,
location: str,
train_data: str,
auc_threshold: float,
model: Output[Model],
metrics: Output[Metrics],
):
"""Kubeflow pipeline component that trains and writes a model artifact."""
from google.cloud import aiplatform as aip
import pickle, json, os
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import pandas as pd
aip.init(project=project, location=location)
df = pd.read_csv(train_data)
feature_cols = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
pipe = Pipeline([("sc", StandardScaler()), ("clf", GradientBoostingClassifier())])
pipe.fit(df[feature_cols], df["churned"])
auc = roc_auc_score(df["churned"], pipe.predict_proba(df[feature_cols])[:, 1])
os.makedirs(model.path, exist_ok=True)
with open(os.path.join(model.path, "model.pkl"), "wb") as f:
pickle.dump(pipe, f)
metrics.log_metric("auc", auc)
print(f"Component AUC: {auc:.4f}")
@dsl.pipeline(name="churn-training-pipeline", description="Train and register churn model")
def churn_pipeline(
project: str = PROJECT,
location: str = LOCATION,
train_data: str = f"{BUCKET}/data/train.csv",
auc_threshold: float = 0.75,
):
train_task = train_component(
project=project,
location=location,
train_data=train_data,
auc_threshold=auc_threshold,
)
train_task.set_cpu_limit("4").set_memory_limit("16G")
def compile_and_run_pipeline() -> None:
init_vertex()
compiler.Compiler().compile(churn_pipeline, "churn_pipeline.yaml")
pipeline_job = aiplatform.PipelineJob(
display_name="churn-pipeline-run",
template_path="churn_pipeline.yaml",
parameter_values={
"project": PROJECT,
"location": LOCATION,
"train_data": f"{BUCKET}/data/train.csv",
},
)
pipeline_job.submit(service_account=f"vertex-sa@{PROJECT}.iam.gserviceaccount.com")
print(f"Pipeline submitted: {pipeline_job.resource_name}")
TypeScript Client
// lib/vertex/client.ts — TypeScript client for Vertex AI endpoints
import { PredictionServiceClient } from "@google-cloud/aiplatform"
import { helpers } from "@google-cloud/aiplatform"
const PROJECT = process.env.GCP_PROJECT ?? "my-gcp-project"
const LOCATION = process.env.GCP_LOCATION ?? "us-central1"
const predictionClient = new PredictionServiceClient({
apiEndpoint: `${LOCATION}-aiplatform.googleapis.com`,
})
export type ChurnInput = {
age: number
tenure_days: number
monthly_spend: number
support_tickets: number
last_login_days: number
}
export type ChurnPrediction = {
churn_probability: number
risk_tier: "HIGH" | "MEDIUM" | "LOW"
}
export async function predictChurn(
endpointId: string,
records: ChurnInput[],
): Promise<ChurnPrediction[]> {
const endpoint = `projects/${PROJECT}/locations/${LOCATION}/endpoints/${endpointId}`
const instances = records.map(r =>
helpers.toValue({
instances: [[r.age, r.tenure_days, r.monthly_spend, r.support_tickets, r.last_login_days]],
})
)
const [response] = await predictionClient.predict({
endpoint,
instances,
})
return (response.predictions ?? []).map((pred: any) => {
const probs = helpers.fromValue(pred) as number[]
const prob = Array.isArray(probs) ? probs[1] ?? probs[0] : 0
return {
churn_probability: Math.round(prob * 10000) / 10000,
risk_tier: prob > 0.7 ? "HIGH" : prob > 0.3 ? "MEDIUM" : "LOW",
}
})
}
For the SageMaker alternative when your infrastructure is AWS-native with data in S3, existing IAM roles, and VPC configurations — SageMaker integrates with Glue, Athena, and Redshift while Vertex AI is the natural choice for GCP-native teams with BigQuery data warehouses and existing Cloud Storage and IAM configurations. For the Kubeflow (self-managed) alternative when needing full control over the Kubeflow infrastructure running inside your own Kubernetes cluster without Google Cloud dependency — self-managed Kubeflow gives complete portability while Vertex AI Pipelines manages the orchestration layer as a fully-managed service, eliminating cluster maintenance overhead. The Claude Skills 360 bundle includes Vertex AI skill sets covering custom training jobs, endpoint deployment, batch prediction, Kubeflow pipeline components, and TypeScript prediction clients. Start with the free tier to try GCP ML workflow generation.