The gcp:dataproc/cluster:Cluster resource, part of the Pulumi GCP provider, provisions a Dataproc cluster: its master and worker nodes, machine types, disk configuration, and optional GPU accelerators. This guide focuses on three capabilities: basic cluster creation with defaults, custom machine types and disk sizing, and GPU accelerator attachment.
Dataproc clusters require a GCS staging bucket, may reference service accounts for permissions, and can run initialization scripts from Cloud Storage. The examples are intentionally small. Combine them with your own IAM roles, network configuration, and job submission logic.
Create a minimal cluster with default configuration
Most deployments start with a minimal cluster that uses GCP defaults for machine types and worker counts, providing a quick way to run Spark or Hadoop jobs.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const simplecluster = new gcp.dataproc.Cluster("simplecluster", {
name: "simplecluster",
region: "us-central1",
});
import pulumi
import pulumi_gcp as gcp
simplecluster = gcp.dataproc.Cluster("simplecluster",
name="simplecluster",
region="us-central1")
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewCluster(ctx, "simplecluster", &dataproc.ClusterArgs{
Name: pulumi.String("simplecluster"),
Region: pulumi.String("us-central1"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var simplecluster = new Gcp.Dataproc.Cluster("simplecluster", new()
{
Name = "simplecluster",
Region = "us-central1",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var simplecluster = new Cluster("simplecluster", ClusterArgs.builder()
.name("simplecluster")
.region("us-central1")
.build());
}
}
resources:
simplecluster:
type: gcp:dataproc:Cluster
properties:
name: simplecluster
region: us-central1
When you specify only name and region, GCP selects default machine types, disk sizes, and worker counts. The cluster uses the default VPC network and Compute Engine service account. This configuration works for development and testing; production clusters need explicit resource tuning.
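If other parts of your program need to reference the cluster (for example, to wire up job submission later), you can export its outputs. A small TypeScript sketch continuing the example above, assuming the provider surfaces the auto-created staging bucket on the clusterConfig output:
// Sketch: export values for use elsewhere in the stack or from the CLI.
// Assumes the auto-created staging bucket is exposed as clusterConfig.bucket.
export const clusterName = simplecluster.name;
export const stagingBucket = simplecluster.clusterConfig.apply(c => c?.bucket);
The same pattern applies in the other languages via their stack-output mechanisms.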
Configure master and worker nodes with custom resources
Production workloads require explicit control over machine types, disk configuration, and worker counts to match performance requirements and cost constraints.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const _default = new gcp.serviceaccount.Account("default", {
accountId: "service-account-id",
displayName: "Service Account",
});
const mycluster = new gcp.dataproc.Cluster("mycluster", {
name: "mycluster",
region: "us-central1",
gracefulDecommissionTimeout: "120s",
labels: {
foo: "bar",
},
clusterConfig: {
stagingBucket: "dataproc-staging-bucket",
clusterTier: "CLUSTER_TIER_STANDARD",
masterConfig: {
numInstances: 1,
machineType: "e2-medium",
diskConfig: {
bootDiskType: "pd-ssd",
bootDiskSizeGb: 30,
},
},
workerConfig: {
numInstances: 2,
machineType: "e2-medium",
minCpuPlatform: "Intel Skylake",
diskConfig: {
bootDiskSizeGb: 30,
numLocalSsds: 1,
},
},
preemptibleWorkerConfig: {
numInstances: 0,
},
softwareConfig: {
imageVersion: "2.0.35-debian10",
overrideProperties: {
"dataproc:dataproc.allow.zero.workers": "true",
},
},
gceClusterConfig: {
tags: [
"foo",
"bar",
],
serviceAccount: _default.email,
serviceAccountScopes: ["cloud-platform"],
},
initializationActions: [{
script: "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
timeoutSec: 500,
}],
},
});
import pulumi
import pulumi_gcp as gcp
default = gcp.serviceaccount.Account("default",
account_id="service-account-id",
display_name="Service Account")
mycluster = gcp.dataproc.Cluster("mycluster",
name="mycluster",
region="us-central1",
graceful_decommission_timeout="120s",
labels={
"foo": "bar",
},
cluster_config={
"staging_bucket": "dataproc-staging-bucket",
"cluster_tier": "CLUSTER_TIER_STANDARD",
"master_config": {
"num_instances": 1,
"machine_type": "e2-medium",
"disk_config": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 30,
},
},
"worker_config": {
"num_instances": 2,
"machine_type": "e2-medium",
"min_cpu_platform": "Intel Skylake",
"disk_config": {
"boot_disk_size_gb": 30,
"num_local_ssds": 1,
},
},
"preemptible_worker_config": {
"num_instances": 0,
},
"software_config": {
"image_version": "2.0.35-debian10",
"override_properties": {
"dataproc:dataproc.allow.zero.workers": "true",
},
},
"gce_cluster_config": {
"tags": [
"foo",
"bar",
],
"service_account": default.email,
"service_account_scopes": ["cloud-platform"],
},
"initialization_actions": [{
"script": "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
"timeout_sec": 500,
}],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/serviceaccount"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_default, err := serviceaccount.NewAccount(ctx, "default", &serviceaccount.AccountArgs{
AccountId: pulumi.String("service-account-id"),
DisplayName: pulumi.String("Service Account"),
})
if err != nil {
return err
}
_, err = dataproc.NewCluster(ctx, "mycluster", &dataproc.ClusterArgs{
Name: pulumi.String("mycluster"),
Region: pulumi.String("us-central1"),
GracefulDecommissionTimeout: pulumi.String("120s"),
Labels: pulumi.StringMap{
"foo": pulumi.String("bar"),
},
ClusterConfig: &dataproc.ClusterClusterConfigArgs{
StagingBucket: pulumi.String("dataproc-staging-bucket"),
ClusterTier: pulumi.String("CLUSTER_TIER_STANDARD"),
MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
NumInstances: pulumi.Int(1),
MachineType: pulumi.String("e2-medium"),
DiskConfig: &dataproc.ClusterClusterConfigMasterConfigDiskConfigArgs{
BootDiskType: pulumi.String("pd-ssd"),
BootDiskSizeGb: pulumi.Int(30),
},
},
WorkerConfig: &dataproc.ClusterClusterConfigWorkerConfigArgs{
NumInstances: pulumi.Int(2),
MachineType: pulumi.String("e2-medium"),
MinCpuPlatform: pulumi.String("Intel Skylake"),
DiskConfig: &dataproc.ClusterClusterConfigWorkerConfigDiskConfigArgs{
BootDiskSizeGb: pulumi.Int(30),
NumLocalSsds: pulumi.Int(1),
},
},
PreemptibleWorkerConfig: &dataproc.ClusterClusterConfigPreemptibleWorkerConfigArgs{
NumInstances: pulumi.Int(0),
},
SoftwareConfig: &dataproc.ClusterClusterConfigSoftwareConfigArgs{
ImageVersion: pulumi.String("2.0.35-debian10"),
OverrideProperties: pulumi.StringMap{
"dataproc:dataproc.allow.zero.workers": pulumi.String("true"),
},
},
GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
Tags: pulumi.StringArray{
pulumi.String("foo"),
pulumi.String("bar"),
},
ServiceAccount: _default.Email,
ServiceAccountScopes: pulumi.StringArray{
pulumi.String("cloud-platform"),
},
},
InitializationActions: dataproc.ClusterClusterConfigInitializationActionArray{
&dataproc.ClusterClusterConfigInitializationActionArgs{
Script: pulumi.String("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh"),
TimeoutSec: pulumi.Int(500),
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var @default = new Gcp.ServiceAccount.Account("default", new()
{
AccountId = "service-account-id",
DisplayName = "Service Account",
});
var mycluster = new Gcp.Dataproc.Cluster("mycluster", new()
{
Name = "mycluster",
Region = "us-central1",
GracefulDecommissionTimeout = "120s",
Labels =
{
{ "foo", "bar" },
},
ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
{
StagingBucket = "dataproc-staging-bucket",
ClusterTier = "CLUSTER_TIER_STANDARD",
MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
{
NumInstances = 1,
MachineType = "e2-medium",
DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigDiskConfigArgs
{
BootDiskType = "pd-ssd",
BootDiskSizeGb = 30,
},
},
WorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigArgs
{
NumInstances = 2,
MachineType = "e2-medium",
MinCpuPlatform = "Intel Skylake",
DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs
{
BootDiskSizeGb = 30,
NumLocalSsds = 1,
},
},
PreemptibleWorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs
{
NumInstances = 0,
},
SoftwareConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigSoftwareConfigArgs
{
ImageVersion = "2.0.35-debian10",
OverrideProperties =
{
{ "dataproc:dataproc.allow.zero.workers", "true" },
},
},
GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
{
Tags = new[]
{
"foo",
"bar",
},
ServiceAccount = @default.Email,
ServiceAccountScopes = new[]
{
"cloud-platform",
},
},
InitializationActions = new[]
{
new Gcp.Dataproc.Inputs.ClusterClusterConfigInitializationActionArgs
{
Script = "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
TimeoutSec = 500,
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.serviceaccount.Account;
import com.pulumi.gcp.serviceaccount.AccountArgs;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigSoftwareConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigInitializationActionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var default_ = new Account("default", AccountArgs.builder()
.accountId("service-account-id")
.displayName("Service Account")
.build());
var mycluster = new Cluster("mycluster", ClusterArgs.builder()
.name("mycluster")
.region("us-central1")
.gracefulDecommissionTimeout("120s")
.labels(Map.of("foo", "bar"))
.clusterConfig(ClusterClusterConfigArgs.builder()
.stagingBucket("dataproc-staging-bucket")
.clusterTier("CLUSTER_TIER_STANDARD")
.masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
.numInstances(1)
.machineType("e2-medium")
.diskConfig(ClusterClusterConfigMasterConfigDiskConfigArgs.builder()
.bootDiskType("pd-ssd")
.bootDiskSizeGb(30)
.build())
.build())
.workerConfig(ClusterClusterConfigWorkerConfigArgs.builder()
.numInstances(2)
.machineType("e2-medium")
.minCpuPlatform("Intel Skylake")
.diskConfig(ClusterClusterConfigWorkerConfigDiskConfigArgs.builder()
.bootDiskSizeGb(30)
.numLocalSsds(1)
.build())
.build())
.preemptibleWorkerConfig(ClusterClusterConfigPreemptibleWorkerConfigArgs.builder()
.numInstances(0)
.build())
.softwareConfig(ClusterClusterConfigSoftwareConfigArgs.builder()
.imageVersion("2.0.35-debian10")
.overrideProperties(Map.of("dataproc:dataproc.allow.zero.workers", "true"))
.build())
.gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
.tags(
"foo",
"bar")
.serviceAccount(default_.email())
.serviceAccountScopes("cloud-platform")
.build())
.initializationActions(ClusterClusterConfigInitializationActionArgs.builder()
.script("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh")
.timeoutSec(500)
.build())
.build())
.build());
}
}
resources:
default:
type: gcp:serviceaccount:Account
properties:
accountId: service-account-id
displayName: Service Account
mycluster:
type: gcp:dataproc:Cluster
properties:
name: mycluster
region: us-central1
gracefulDecommissionTimeout: 120s
labels:
foo: bar
clusterConfig:
stagingBucket: dataproc-staging-bucket
clusterTier: CLUSTER_TIER_STANDARD
masterConfig:
numInstances: 1
machineType: e2-medium
diskConfig:
bootDiskType: pd-ssd
bootDiskSizeGb: 30
workerConfig:
numInstances: 2
machineType: e2-medium
minCpuPlatform: Intel Skylake
diskConfig:
bootDiskSizeGb: 30
numLocalSsds: 1
preemptibleWorkerConfig:
numInstances: 0
softwareConfig:
imageVersion: 2.0.35-debian10
overrideProperties:
dataproc:dataproc.allow.zero.workers: 'true'
gceClusterConfig:
tags:
- foo
- bar
serviceAccount: ${default.email}
serviceAccountScopes:
- cloud-platform
initializationActions:
- script: gs://dataproc-initialization-actions/stackdriver/stackdriver.sh
timeoutSec: 500
The clusterConfig block defines cluster topology. The masterConfig and workerConfig blocks set numInstances, machineType, and diskConfig for each node type. The stagingBucket stores job dependencies and logs. The gceClusterConfig attaches a service account and network tags. Initialization actions run scripts from Cloud Storage when nodes start, useful for installing custom libraries or configuring monitoring.
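The example points stagingBucket and the initialization script at pre-existing Cloud Storage paths. If you would rather provision those alongside the cluster, a minimal TypeScript sketch (the bucket and script names here are hypothetical) could look like this:
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Hypothetical bucket used both as the staging bucket and to host the script.
const staging = new gcp.storage.Bucket("dataproc-staging", {
    location: "US-CENTRAL1",
    uniformBucketLevelAccess: true,
});

// Upload a local initialization script to the bucket.
const initScript = new gcp.storage.BucketObject("init-script", {
    bucket: staging.name,
    name: "scripts/install-deps.sh",
    source: new pulumi.asset.FileAsset("./scripts/install-deps.sh"),
});

const pipelineCluster = new gcp.dataproc.Cluster("pipeline-cluster", {
    region: "us-central1",
    clusterConfig: {
        stagingBucket: staging.name,
        initializationActions: [{
            script: pulumi.interpolate`gs://${staging.name}/${initScript.name}`,
            timeoutSec: 500,
        }],
    },
});
Because the script path is built from resource outputs, Pulumi creates the bucket and object before the cluster that depends on them.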
Attach GPU accelerators for ML workloads
Machine learning pipelines running on Spark often need GPU acceleration for training or inference tasks.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const acceleratedCluster = new gcp.dataproc.Cluster("accelerated_cluster", {
name: "my-cluster-with-gpu",
region: "us-central1",
clusterConfig: {
gceClusterConfig: {
zone: "us-central1-a",
},
masterConfig: {
accelerators: [{
acceleratorType: "nvidia-tesla-k80",
acceleratorCount: 1,
}],
},
},
});
import pulumi
import pulumi_gcp as gcp
accelerated_cluster = gcp.dataproc.Cluster("accelerated_cluster",
name="my-cluster-with-gpu",
region="us-central1",
cluster_config={
"gce_cluster_config": {
"zone": "us-central1-a",
},
"master_config": {
"accelerators": [{
"accelerator_type": "nvidia-tesla-k80",
"accelerator_count": 1,
}],
},
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewCluster(ctx, "accelerated_cluster", &dataproc.ClusterArgs{
Name: pulumi.String("my-cluster-with-gpu"),
Region: pulumi.String("us-central1"),
ClusterConfig: &dataproc.ClusterClusterConfigArgs{
GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
Zone: pulumi.String("us-central1-a"),
},
MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
Accelerators: dataproc.ClusterClusterConfigMasterConfigAcceleratorArray{
&dataproc.ClusterClusterConfigMasterConfigAcceleratorArgs{
AcceleratorType: pulumi.String("nvidia-tesla-k80"),
AcceleratorCount: pulumi.Int(1),
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var acceleratedCluster = new Gcp.Dataproc.Cluster("accelerated_cluster", new()
{
Name = "my-cluster-with-gpu",
Region = "us-central1",
ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
{
GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
{
Zone = "us-central1-a",
},
MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
{
Accelerators = new[]
{
new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigAcceleratorArgs
{
AcceleratorType = "nvidia-tesla-k80",
AcceleratorCount = 1,
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigAcceleratorArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var acceleratedCluster = new Cluster("acceleratedCluster", ClusterArgs.builder()
.name("my-cluster-with-gpu")
.region("us-central1")
.clusterConfig(ClusterClusterConfigArgs.builder()
.gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
.zone("us-central1-a")
.build())
.masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
.accelerators(ClusterClusterConfigMasterConfigAcceleratorArgs.builder()
.acceleratorType("nvidia-tesla-k80")
.acceleratorCount(1)
.build())
.build())
.build())
.build());
}
}
resources:
acceleratedCluster:
type: gcp:dataproc:Cluster
name: accelerated_cluster
properties:
name: my-cluster-with-gpu
region: us-central1
clusterConfig:
gceClusterConfig:
zone: us-central1-a
masterConfig:
accelerators:
- acceleratorType: nvidia-tesla-k80
acceleratorCount: 1
The accelerators block attaches GPUs to nodes by specifying acceleratorType and acceleratorCount. GPU availability varies by zone, so you must set gceClusterConfig.zone explicitly rather than relying on region-level defaults. This configuration enables GPU-accelerated Spark jobs for ML workloads.
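Accelerators attach to worker nodes the same way, via an accelerators block on workerConfig. A brief TypeScript sketch, with an illustrative GPU model and machine type (both must be available in the chosen zone and compatible with each other):
import * as gcp from "@pulumi/gcp";

// Sketch: GPUs on worker nodes. nvidia-tesla-t4 and n1-standard-8 are
// illustrative; check zone availability and machine-type compatibility.
const gpuWorkers = new gcp.dataproc.Cluster("gpu-workers", {
    region: "us-central1",
    clusterConfig: {
        gceClusterConfig: {
            zone: "us-central1-a",
        },
        workerConfig: {
            numInstances: 2,
            machineType: "n1-standard-8",
            accelerators: [{
                acceleratorType: "nvidia-tesla-t4",
                acceleratorCount: 1,
            }],
        },
    },
});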
Beyond these examples
These snippets focus on specific cluster-level features: cluster sizing and machine types, disk configuration and local SSDs, and GPU accelerators and initialization actions. They’re intentionally minimal rather than full data processing pipelines.
The examples may reference pre-existing infrastructure such as GCS staging buckets, service accounts with appropriate IAM roles, and initialization scripts in Cloud Storage. They focus on configuring the cluster rather than provisioning everything around it.
To keep things focused, common cluster patterns are omitted, including:
- Virtual clusters on GKE (virtualClusterConfig)
- Preemptible workers for cost optimization
- Metastore integration (metastoreConfig)
- Encryption configuration (encryptionConfig)
- Autoscaling policies
- High availability (HA) master configuration
These omissions are intentional: the goal is to illustrate how each cluster feature is wired, not to provide drop-in data processing modules. See the Dataproc Cluster resource reference for all available configuration options.
Frequently Asked Questions
Configuration & Updates
- Only labels, clusterConfig.workerConfig.numInstances, and clusterConfig.preemptibleWorkerConfig.numInstances can be updated in place. All other property changes cause cluster recreation.
- Changing name, region, machineType, diskConfig, or softwareConfig triggers cluster recreation.
- To scale a cluster, adjust clusterConfig.workerConfig.numInstances or clusterConfig.preemptibleWorkerConfig.numInstances (see the sketch after this list). Note that changes to minNumInstances are ignored.
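One convenient way to use the in-place worker-count update is to drive numInstances from stack configuration, so scaling is just a config change followed by pulumi up. A TypeScript sketch (the workerCount config key is hypothetical):
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Sketch: read the worker count from stack config; changing it later updates
// the cluster in place rather than recreating it.
const config = new pulumi.Config();
const workerCount = config.getNumber("workerCount") ?? 2;

const scalableCluster = new gcp.dataproc.Cluster("scalable-cluster", {
    region: "us-central1",
    clusterConfig: {
        workerConfig: {
            numInstances: workerCount,
        },
    },
});
Running pulumi config set workerCount 4 followed by pulumi up then resizes the existing cluster in place.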
Cluster Setup & Initialization
- To run setup scripts when nodes start, use clusterConfig.initializationActions with a script (a GCS path like gs://bucket/script.sh) and a timeoutSec value.
- To attach a custom service account, set clusterConfig.gceClusterConfig.serviceAccount to the service account email and serviceAccountScopes to the required scopes (e.g., ["cloud-platform"]).
- To allow a cluster to run with zero workers, set clusterConfig.preemptibleWorkerConfig.numInstances to 0 and add "dataproc:dataproc.allow.zero.workers": "true" to softwareConfig.overrideProperties.
Compute Resources
- To attach GPUs, configure clusterConfig.masterConfig.accelerators with acceleratorType (e.g., "nvidia-tesla-k80") and acceleratorCount.
- Clusters are created in the global region if not specified. The region property is immutable after creation.