The gcp:dataproc/cluster:Cluster resource, part of the Pulumi GCP provider, provisions a Cloud Dataproc cluster: its compute nodes, storage, networking, and runtime configuration. This guide focuses on three capabilities: basic cluster creation, machine type and disk configuration, and GPU accelerator attachment.
Dataproc clusters run in a GCP project and region, and may reference service accounts, GCS staging buckets, and initialization scripts. The examples are intentionally small. Combine them with your own IAM roles, networking, and storage infrastructure.
Create a minimal cluster with defaults
Most deployments start with a minimal configuration to run Spark or Hadoop jobs, relying on GCP defaults for machine types and worker counts.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const simplecluster = new gcp.dataproc.Cluster("simplecluster", {
name: "simplecluster",
region: "us-central1",
});
import pulumi
import pulumi_gcp as gcp
simplecluster = gcp.dataproc.Cluster("simplecluster",
name="simplecluster",
region="us-central1")
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewCluster(ctx, "simplecluster", &dataproc.ClusterArgs{
Name: pulumi.String("simplecluster"),
Region: pulumi.String("us-central1"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var simplecluster = new Gcp.Dataproc.Cluster("simplecluster", new()
{
Name = "simplecluster",
Region = "us-central1",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var simplecluster = new Cluster("simplecluster", ClusterArgs.builder()
.name("simplecluster")
.region("us-central1")
.build());
}
}
resources:
simplecluster:
type: gcp:dataproc:Cluster
properties:
name: simplecluster
region: us-central1
The name property sets a unique cluster identifier within the project and region, and the region property determines where the compute nodes run. Without an explicit clusterConfig, Dataproc falls back to its default machine types, disk sizes, and worker counts.
Configure machine types, disks, and initialization scripts
Production clusters require explicit control over compute resources and startup behavior. You specify machine types, disk configurations, and initialization actions to install monitoring agents or custom software.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const _default = new gcp.serviceaccount.Account("default", {
accountId: "service-account-id",
displayName: "Service Account",
});
const mycluster = new gcp.dataproc.Cluster("mycluster", {
name: "mycluster",
region: "us-central1",
gracefulDecommissionTimeout: "120s",
labels: {
foo: "bar",
},
clusterConfig: {
stagingBucket: "dataproc-staging-bucket",
clusterTier: "CLUSTER_TIER_STANDARD",
masterConfig: {
numInstances: 1,
machineType: "e2-medium",
diskConfig: {
bootDiskType: "pd-ssd",
bootDiskSizeGb: 30,
},
},
workerConfig: {
numInstances: 2,
machineType: "e2-medium",
minCpuPlatform: "Intel Skylake",
diskConfig: {
bootDiskSizeGb: 30,
numLocalSsds: 1,
},
},
preemptibleWorkerConfig: {
numInstances: 0,
},
softwareConfig: {
imageVersion: "2.0.35-debian10",
overrideProperties: {
"dataproc:dataproc.allow.zero.workers": "true",
},
},
gceClusterConfig: {
tags: [
"foo",
"bar",
],
serviceAccount: _default.email,
serviceAccountScopes: ["cloud-platform"],
},
initializationActions: [{
script: "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
timeoutSec: 500,
}],
},
});
import pulumi
import pulumi_gcp as gcp
default = gcp.serviceaccount.Account("default",
account_id="service-account-id",
display_name="Service Account")
mycluster = gcp.dataproc.Cluster("mycluster",
name="mycluster",
region="us-central1",
graceful_decommission_timeout="120s",
labels={
"foo": "bar",
},
cluster_config={
"staging_bucket": "dataproc-staging-bucket",
"cluster_tier": "CLUSTER_TIER_STANDARD",
"master_config": {
"num_instances": 1,
"machine_type": "e2-medium",
"disk_config": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 30,
},
},
"worker_config": {
"num_instances": 2,
"machine_type": "e2-medium",
"min_cpu_platform": "Intel Skylake",
"disk_config": {
"boot_disk_size_gb": 30,
"num_local_ssds": 1,
},
},
"preemptible_worker_config": {
"num_instances": 0,
},
"software_config": {
"image_version": "2.0.35-debian10",
"override_properties": {
"dataproc:dataproc.allow.zero.workers": "true",
},
},
"gce_cluster_config": {
"tags": [
"foo",
"bar",
],
"service_account": default.email,
"service_account_scopes": ["cloud-platform"],
},
"initialization_actions": [{
"script": "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
"timeout_sec": 500,
}],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/serviceaccount"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_default, err := serviceaccount.NewAccount(ctx, "default", &serviceaccount.AccountArgs{
AccountId: pulumi.String("service-account-id"),
DisplayName: pulumi.String("Service Account"),
})
if err != nil {
return err
}
_, err = dataproc.NewCluster(ctx, "mycluster", &dataproc.ClusterArgs{
Name: pulumi.String("mycluster"),
Region: pulumi.String("us-central1"),
GracefulDecommissionTimeout: pulumi.String("120s"),
Labels: pulumi.StringMap{
"foo": pulumi.String("bar"),
},
ClusterConfig: &dataproc.ClusterClusterConfigArgs{
StagingBucket: pulumi.String("dataproc-staging-bucket"),
ClusterTier: pulumi.String("CLUSTER_TIER_STANDARD"),
MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
NumInstances: pulumi.Int(1),
MachineType: pulumi.String("e2-medium"),
DiskConfig: &dataproc.ClusterClusterConfigMasterConfigDiskConfigArgs{
BootDiskType: pulumi.String("pd-ssd"),
BootDiskSizeGb: pulumi.Int(30),
},
},
WorkerConfig: &dataproc.ClusterClusterConfigWorkerConfigArgs{
NumInstances: pulumi.Int(2),
MachineType: pulumi.String("e2-medium"),
MinCpuPlatform: pulumi.String("Intel Skylake"),
DiskConfig: &dataproc.ClusterClusterConfigWorkerConfigDiskConfigArgs{
BootDiskSizeGb: pulumi.Int(30),
NumLocalSsds: pulumi.Int(1),
},
},
PreemptibleWorkerConfig: &dataproc.ClusterClusterConfigPreemptibleWorkerConfigArgs{
NumInstances: pulumi.Int(0),
},
SoftwareConfig: &dataproc.ClusterClusterConfigSoftwareConfigArgs{
ImageVersion: pulumi.String("2.0.35-debian10"),
OverrideProperties: pulumi.StringMap{
"dataproc:dataproc.allow.zero.workers": pulumi.String("true"),
},
},
GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
Tags: pulumi.StringArray{
pulumi.String("foo"),
pulumi.String("bar"),
},
ServiceAccount: _default.Email,
ServiceAccountScopes: pulumi.StringArray{
pulumi.String("cloud-platform"),
},
},
InitializationActions: dataproc.ClusterClusterConfigInitializationActionArray{
&dataproc.ClusterClusterConfigInitializationActionArgs{
Script: pulumi.String("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh"),
TimeoutSec: pulumi.Int(500),
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var @default = new Gcp.ServiceAccount.Account("default", new()
{
AccountId = "service-account-id",
DisplayName = "Service Account",
});
var mycluster = new Gcp.Dataproc.Cluster("mycluster", new()
{
Name = "mycluster",
Region = "us-central1",
GracefulDecommissionTimeout = "120s",
Labels =
{
{ "foo", "bar" },
},
ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
{
StagingBucket = "dataproc-staging-bucket",
ClusterTier = "CLUSTER_TIER_STANDARD",
MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
{
NumInstances = 1,
MachineType = "e2-medium",
DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigDiskConfigArgs
{
BootDiskType = "pd-ssd",
BootDiskSizeGb = 30,
},
},
WorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigArgs
{
NumInstances = 2,
MachineType = "e2-medium",
MinCpuPlatform = "Intel Skylake",
DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs
{
BootDiskSizeGb = 30,
NumLocalSsds = 1,
},
},
PreemptibleWorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs
{
NumInstances = 0,
},
SoftwareConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigSoftwareConfigArgs
{
ImageVersion = "2.0.35-debian10",
OverrideProperties =
{
{ "dataproc:dataproc.allow.zero.workers", "true" },
},
},
GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
{
Tags = new[]
{
"foo",
"bar",
},
ServiceAccount = @default.Email,
ServiceAccountScopes = new[]
{
"cloud-platform",
},
},
InitializationActions = new[]
{
new Gcp.Dataproc.Inputs.ClusterClusterConfigInitializationActionArgs
{
Script = "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
TimeoutSec = 500,
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.serviceaccount.Account;
import com.pulumi.gcp.serviceaccount.AccountArgs;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigSoftwareConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigInitializationActionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var default_ = new Account("default", AccountArgs.builder()
.accountId("service-account-id")
.displayName("Service Account")
.build());
var mycluster = new Cluster("mycluster", ClusterArgs.builder()
.name("mycluster")
.region("us-central1")
.gracefulDecommissionTimeout("120s")
.labels(Map.of("foo", "bar"))
.clusterConfig(ClusterClusterConfigArgs.builder()
.stagingBucket("dataproc-staging-bucket")
.clusterTier("CLUSTER_TIER_STANDARD")
.masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
.numInstances(1)
.machineType("e2-medium")
.diskConfig(ClusterClusterConfigMasterConfigDiskConfigArgs.builder()
.bootDiskType("pd-ssd")
.bootDiskSizeGb(30)
.build())
.build())
.workerConfig(ClusterClusterConfigWorkerConfigArgs.builder()
.numInstances(2)
.machineType("e2-medium")
.minCpuPlatform("Intel Skylake")
.diskConfig(ClusterClusterConfigWorkerConfigDiskConfigArgs.builder()
.bootDiskSizeGb(30)
.numLocalSsds(1)
.build())
.build())
.preemptibleWorkerConfig(ClusterClusterConfigPreemptibleWorkerConfigArgs.builder()
.numInstances(0)
.build())
.softwareConfig(ClusterClusterConfigSoftwareConfigArgs.builder()
.imageVersion("2.0.35-debian10")
.overrideProperties(Map.of("dataproc:dataproc.allow.zero.workers", "true"))
.build())
.gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
.tags(
"foo",
"bar")
.serviceAccount(default_.email())
.serviceAccountScopes("cloud-platform")
.build())
.initializationActions(ClusterClusterConfigInitializationActionArgs.builder()
.script("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh")
.timeoutSec(500)
.build())
.build())
.build());
}
}
resources:
default:
type: gcp:serviceaccount:Account
properties:
accountId: service-account-id
displayName: Service Account
mycluster:
type: gcp:dataproc:Cluster
properties:
name: mycluster
region: us-central1
gracefulDecommissionTimeout: 120s
labels:
foo: bar
clusterConfig:
stagingBucket: dataproc-staging-bucket
clusterTier: CLUSTER_TIER_STANDARD
masterConfig:
numInstances: 1
machineType: e2-medium
diskConfig:
bootDiskType: pd-ssd
bootDiskSizeGb: 30
workerConfig:
numInstances: 2
machineType: e2-medium
minCpuPlatform: Intel Skylake
diskConfig:
bootDiskSizeGb: 30
numLocalSsds: 1
preemptibleWorkerConfig:
numInstances: 0
softwareConfig:
imageVersion: 2.0.35-debian10
overrideProperties:
dataproc:dataproc.allow.zero.workers: 'true'
gceClusterConfig:
tags:
- foo
- bar
serviceAccount: ${default.email}
serviceAccountScopes:
- cloud-platform
initializationActions:
- script: gs://dataproc-initialization-actions/stackdriver/stackdriver.sh
timeoutSec: 500
The clusterConfig block defines the cluster topology. The masterConfig and workerConfig blocks set machine types and disk configurations for each node role. The initializationActions array runs scripts from GCS at cluster startup, enabling you to install Stackdriver agents or custom dependencies. The gceClusterConfig block attaches a service account for GCP API access and applies network tags for firewall rules.
Attach GPU accelerators for ML workloads
Machine learning workloads running on Dataproc often require GPU acceleration for training or inference.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const acceleratedCluster = new gcp.dataproc.Cluster("accelerated_cluster", {
name: "my-cluster-with-gpu",
region: "us-central1",
clusterConfig: {
gceClusterConfig: {
zone: "us-central1-a",
},
masterConfig: {
accelerators: [{
acceleratorType: "nvidia-tesla-k80",
acceleratorCount: 1,
}],
},
},
});
import pulumi
import pulumi_gcp as gcp
accelerated_cluster = gcp.dataproc.Cluster("accelerated_cluster",
name="my-cluster-with-gpu",
region="us-central1",
cluster_config={
"gce_cluster_config": {
"zone": "us-central1-a",
},
"master_config": {
"accelerators": [{
"accelerator_type": "nvidia-tesla-k80",
"accelerator_count": 1,
}],
},
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewCluster(ctx, "accelerated_cluster", &dataproc.ClusterArgs{
Name: pulumi.String("my-cluster-with-gpu"),
Region: pulumi.String("us-central1"),
ClusterConfig: &dataproc.ClusterClusterConfigArgs{
GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
Zone: pulumi.String("us-central1-a"),
},
MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
Accelerators: dataproc.ClusterClusterConfigMasterConfigAcceleratorArray{
&dataproc.ClusterClusterConfigMasterConfigAcceleratorArgs{
AcceleratorType: pulumi.String("nvidia-tesla-k80"),
AcceleratorCount: pulumi.Int(1),
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var acceleratedCluster = new Gcp.Dataproc.Cluster("accelerated_cluster", new()
{
Name = "my-cluster-with-gpu",
Region = "us-central1",
ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
{
GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
{
Zone = "us-central1-a",
},
MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
{
Accelerators = new[]
{
new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigAcceleratorArgs
{
AcceleratorType = "nvidia-tesla-k80",
AcceleratorCount = 1,
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigAcceleratorArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var acceleratedCluster = new Cluster("acceleratedCluster", ClusterArgs.builder()
.name("my-cluster-with-gpu")
.region("us-central1")
.clusterConfig(ClusterClusterConfigArgs.builder()
.gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
.zone("us-central1-a")
.build())
.masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
.accelerators(ClusterClusterConfigMasterConfigAcceleratorArgs.builder()
.acceleratorType("nvidia-tesla-k80")
.acceleratorCount(1)
.build())
.build())
.build())
.build());
}
}
resources:
acceleratedCluster:
type: gcp:dataproc:Cluster
name: accelerated_cluster
properties:
name: my-cluster-with-gpu
region: us-central1
clusterConfig:
gceClusterConfig:
zone: us-central1-a
masterConfig:
accelerators:
- acceleratorType: nvidia-tesla-k80
acceleratorCount: '1'
The accelerators block within masterConfig or workerConfig attaches GPUs to nodes. The acceleratorType specifies the GPU model (e.g., nvidia-tesla-k80), and acceleratorCount sets how many GPUs per node. The zone property must specify a zone that supports the chosen accelerator type.
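The example above attaches the accelerator to the master node; worker nodes take the same block inside workerConfig. Below is a minimal TypeScript sketch of that variant. The zone, the n1-standard-4 machine type, and the nvidia-tesla-t4 accelerator are illustrative choices only, so confirm the accelerator is offered in your zone before deploying.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Sketch: GPUs attached to the worker nodes instead of the master.
// The zone, machine type, and accelerator model below are placeholders.
const gpuWorkers = new gcp.dataproc.Cluster("gpu-workers", {
    name: "gpu-workers",
    region: "us-central1",
    clusterConfig: {
        gceClusterConfig: {
            zone: "us-central1-a",
        },
        workerConfig: {
            numInstances: 2,
            machineType: "n1-standard-4",
            accelerators: [{
                acceleratorType: "nvidia-tesla-t4",
                acceleratorCount: 1,
            }],
        },
    },
});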
Beyond these examples
These snippets focus on specific cluster-level features: cluster sizing and machine types, disk configuration and initialization scripts, and GPU accelerators for ML workloads. They’re intentionally minimal rather than full data processing environments.
The examples may reference pre-existing infrastructure such as GCS staging buckets, service accounts with appropriate IAM roles, and initialization scripts in GCS. They focus on configuring the cluster rather than provisioning everything around it.
To keep things focused, common cluster patterns are omitted, including:
- Preemptible workers for cost optimization (preemptibleWorkerConfig)
- Virtual clusters on GKE (virtualClusterConfig)
- Autoscaling policies (autoscalingConfig)
- Network and subnetwork configuration (gceClusterConfig.network)
- Encryption keys and security settings
- Graceful decommissioning timeouts (gracefulDecommissionTimeout)
These omissions are intentional: the goal is to illustrate how each cluster feature is wired, not provide drop-in data processing modules. See the Dataproc Cluster resource reference for all available configuration options.
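One omitted pattern is small enough to sketch here: preemptible secondary workers for cost savings. The following TypeScript sketch is illustrative only; the resource name, instance counts, and disk size are placeholders, and preemptible workers reuse the primary workers' machine type.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Sketch: two primary workers plus two preemptible secondary workers.
// Preemptible nodes are cheaper but can be reclaimed by GCP at any time.
const costOptimized = new gcp.dataproc.Cluster("cost-optimized", {
    name: "cost-optimized",
    region: "us-central1",
    clusterConfig: {
        workerConfig: {
            numInstances: 2,
            machineType: "e2-medium",
        },
        preemptibleWorkerConfig: {
            numInstances: 2,
            diskConfig: {
                bootDiskSizeGb: 30,
            },
        },
    },
});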
Frequently Asked Questions
Configuration & Updates
- Only labels, cluster_config.worker_config.num_instances, and cluster_config.preemptible_worker_config.num_instances can be updated in place. Changing any other property causes full cluster recreation due to API limitations.
- Changes to cluster_config.worker_config.min_num_instances are silently ignored by the Dataproc API. Use num_instances instead to scale worker nodes.
- Set gracefulDecommissionTimeout (e.g., "120s") to allow graceful decommissioning when changing num_instances for workers or preemptible workers.
Cluster Setup & Initialization
- Clusters are created in the global region if no region is specified. The region is immutable after creation.
- The stagingBucket in clusterConfig stores temporary files and job artifacts during cluster operations.
- To run startup scripts, add initializationActions in clusterConfig with a script path (typically a GCS URI like "gs://bucket/script.sh") and an optional timeoutSec.
- To grant GCP API access, set serviceAccount (the account email) and serviceAccountScopes (e.g., ["cloud-platform"]) within gceClusterConfig.
Compute Resources
- To attach GPUs, add accelerators within masterConfig or workerConfig, specifying acceleratorType (e.g., "nvidia-tesla-k80") and acceleratorCount. You may also need to specify a zone in gceClusterConfig.
- To run a cluster with zero workers (a single-node cluster), set dataproc:dataproc.allow.zero.workers to "true" in softwareConfig.overrideProperties and configure workerConfig.numInstances to 0.
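Based on the last answer, a minimal single-node cluster could look like the following TypeScript sketch; the cluster name and region are placeholders.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Sketch: a single-node cluster. The override property allows zero workers,
// and workerConfig.numInstances is set to 0 so only the master runs.
const singleNode = new gcp.dataproc.Cluster("single-node", {
    name: "single-node",
    region: "us-central1",
    clusterConfig: {
        softwareConfig: {
            overrideProperties: {
                "dataproc:dataproc.allow.zero.workers": "true",
            },
        },
        workerConfig: {
            numInstances: 0,
        },
    },
});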
Using a different cloud?
Explore analytics guides for other cloud providers: