Create GCP Dataproc Clusters

The gcp:dataproc/cluster:Cluster resource, part of the Pulumi GCP provider, provisions a Cloud Dataproc cluster: its compute nodes, storage, networking, and runtime configuration. This guide focuses on three capabilities: basic cluster creation, machine type and disk configuration, and GPU accelerator attachment.

Dataproc clusters run in a GCP project and region, and may reference service accounts, GCS staging buckets, and initialization scripts. The examples are intentionally small. Combine them with your own IAM roles, networking, and storage infrastructure.

Create a minimal cluster with defaults

Most deployments start with a minimal configuration to run Spark or Hadoop jobs, relying on GCP defaults for machine types and worker counts.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

const simplecluster = new gcp.dataproc.Cluster("simplecluster", {
    name: "simplecluster",
    region: "us-central1",
});
import pulumi
import pulumi_gcp as gcp

simplecluster = gcp.dataproc.Cluster("simplecluster",
    name="simplecluster",
    region="us-central1")
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewCluster(ctx, "simplecluster", &dataproc.ClusterArgs{
			Name:   pulumi.String("simplecluster"),
			Region: pulumi.String("us-central1"),
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    var simplecluster = new Gcp.Dataproc.Cluster("simplecluster", new()
    {
        Name = "simplecluster",
        Region = "us-central1",
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var simplecluster = new Cluster("simplecluster", ClusterArgs.builder()
            .name("simplecluster")
            .region("us-central1")
            .build());

    }
}
resources:
  simplecluster:
    type: gcp:dataproc:Cluster
    properties:
      name: simplecluster
      region: us-central1

The name property sets the cluster name, which must be unique within the project and region. The region property determines where the cluster's compute nodes run. Without an explicit clusterConfig, Dataproc falls back to default machine types, disk sizes, and worker counts.
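
If other parts of your program or the Pulumi CLI need to reference the cluster, you can export its outputs. A small follow-on sketch for the TypeScript variant above (the output names are arbitrary):

// Export cluster attributes as stack outputs.
export const clusterName = simplecluster.name;
export const clusterRegion = simplecluster.region;

After pulumi up, running pulumi stack output clusterName prints the resolved value.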

Configure machine types, disks, and initialization scripts

Production clusters require explicit control over compute resources and startup behavior. You specify machine types, disk configurations, and initialization actions to install monitoring agents or custom software.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

const _default = new gcp.serviceaccount.Account("default", {
    accountId: "service-account-id",
    displayName: "Service Account",
});
const mycluster = new gcp.dataproc.Cluster("mycluster", {
    name: "mycluster",
    region: "us-central1",
    gracefulDecommissionTimeout: "120s",
    labels: {
        foo: "bar",
    },
    clusterConfig: {
        stagingBucket: "dataproc-staging-bucket",
        clusterTier: "CLUSTER_TIER_STANDARD",
        masterConfig: {
            numInstances: 1,
            machineType: "e2-medium",
            diskConfig: {
                bootDiskType: "pd-ssd",
                bootDiskSizeGb: 30,
            },
        },
        workerConfig: {
            numInstances: 2,
            machineType: "e2-medium",
            minCpuPlatform: "Intel Skylake",
            diskConfig: {
                bootDiskSizeGb: 30,
                numLocalSsds: 1,
            },
        },
        preemptibleWorkerConfig: {
            numInstances: 0,
        },
        softwareConfig: {
            imageVersion: "2.0.35-debian10",
            overrideProperties: {
                "dataproc:dataproc.allow.zero.workers": "true",
            },
        },
        gceClusterConfig: {
            tags: [
                "foo",
                "bar",
            ],
            serviceAccount: _default.email,
            serviceAccountScopes: ["cloud-platform"],
        },
        initializationActions: [{
            script: "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
            timeoutSec: 500,
        }],
    },
});
import pulumi
import pulumi_gcp as gcp

default = gcp.serviceaccount.Account("default",
    account_id="service-account-id",
    display_name="Service Account")
mycluster = gcp.dataproc.Cluster("mycluster",
    name="mycluster",
    region="us-central1",
    graceful_decommission_timeout="120s",
    labels={
        "foo": "bar",
    },
    cluster_config={
        "staging_bucket": "dataproc-staging-bucket",
        "cluster_tier": "CLUSTER_TIER_STANDARD",
        "master_config": {
            "num_instances": 1,
            "machine_type": "e2-medium",
            "disk_config": {
                "boot_disk_type": "pd-ssd",
                "boot_disk_size_gb": 30,
            },
        },
        "worker_config": {
            "num_instances": 2,
            "machine_type": "e2-medium",
            "min_cpu_platform": "Intel Skylake",
            "disk_config": {
                "boot_disk_size_gb": 30,
                "num_local_ssds": 1,
            },
        },
        "preemptible_worker_config": {
            "num_instances": 0,
        },
        "software_config": {
            "image_version": "2.0.35-debian10",
            "override_properties": {
                "dataproc:dataproc.allow.zero.workers": "true",
            },
        },
        "gce_cluster_config": {
            "tags": [
                "foo",
                "bar",
            ],
            "service_account": default.email,
            "service_account_scopes": ["cloud-platform"],
        },
        "initialization_actions": [{
            "script": "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
            "timeout_sec": 500,
        }],
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/serviceaccount"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_default, err := serviceaccount.NewAccount(ctx, "default", &serviceaccount.AccountArgs{
			AccountId:   pulumi.String("service-account-id"),
			DisplayName: pulumi.String("Service Account"),
		})
		if err != nil {
			return err
		}
		_, err = dataproc.NewCluster(ctx, "mycluster", &dataproc.ClusterArgs{
			Name:                        pulumi.String("mycluster"),
			Region:                      pulumi.String("us-central1"),
			GracefulDecommissionTimeout: pulumi.String("120s"),
			Labels: pulumi.StringMap{
				"foo": pulumi.String("bar"),
			},
			ClusterConfig: &dataproc.ClusterClusterConfigArgs{
				StagingBucket: pulumi.String("dataproc-staging-bucket"),
				ClusterTier:   pulumi.String("CLUSTER_TIER_STANDARD"),
				MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
					NumInstances: pulumi.Int(1),
					MachineType:  pulumi.String("e2-medium"),
					DiskConfig: &dataproc.ClusterClusterConfigMasterConfigDiskConfigArgs{
						BootDiskType:   pulumi.String("pd-ssd"),
						BootDiskSizeGb: pulumi.Int(30),
					},
				},
				WorkerConfig: &dataproc.ClusterClusterConfigWorkerConfigArgs{
					NumInstances:   pulumi.Int(2),
					MachineType:    pulumi.String("e2-medium"),
					MinCpuPlatform: pulumi.String("Intel Skylake"),
					DiskConfig: &dataproc.ClusterClusterConfigWorkerConfigDiskConfigArgs{
						BootDiskSizeGb: pulumi.Int(30),
						NumLocalSsds:   pulumi.Int(1),
					},
				},
				PreemptibleWorkerConfig: &dataproc.ClusterClusterConfigPreemptibleWorkerConfigArgs{
					NumInstances: pulumi.Int(0),
				},
				SoftwareConfig: &dataproc.ClusterClusterConfigSoftwareConfigArgs{
					ImageVersion: pulumi.String("2.0.35-debian10"),
					OverrideProperties: pulumi.StringMap{
						"dataproc:dataproc.allow.zero.workers": pulumi.String("true"),
					},
				},
				GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
					Tags: pulumi.StringArray{
						pulumi.String("foo"),
						pulumi.String("bar"),
					},
					ServiceAccount: _default.Email,
					ServiceAccountScopes: pulumi.StringArray{
						pulumi.String("cloud-platform"),
					},
				},
				InitializationActions: dataproc.ClusterClusterConfigInitializationActionArray{
					&dataproc.ClusterClusterConfigInitializationActionArgs{
						Script:     pulumi.String("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh"),
						TimeoutSec: pulumi.Int(500),
					},
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    var @default = new Gcp.ServiceAccount.Account("default", new()
    {
        AccountId = "service-account-id",
        DisplayName = "Service Account",
    });

    var mycluster = new Gcp.Dataproc.Cluster("mycluster", new()
    {
        Name = "mycluster",
        Region = "us-central1",
        GracefulDecommissionTimeout = "120s",
        Labels = 
        {
            { "foo", "bar" },
        },
        ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
        {
            StagingBucket = "dataproc-staging-bucket",
            ClusterTier = "CLUSTER_TIER_STANDARD",
            MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
            {
                NumInstances = 1,
                MachineType = "e2-medium",
                DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigDiskConfigArgs
                {
                    BootDiskType = "pd-ssd",
                    BootDiskSizeGb = 30,
                },
            },
            WorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigArgs
            {
                NumInstances = 2,
                MachineType = "e2-medium",
                MinCpuPlatform = "Intel Skylake",
                DiskConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs
                {
                    BootDiskSizeGb = 30,
                    NumLocalSsds = 1,
                },
            },
            PreemptibleWorkerConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs
            {
                NumInstances = 0,
            },
            SoftwareConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigSoftwareConfigArgs
            {
                ImageVersion = "2.0.35-debian10",
                OverrideProperties = 
                {
                    { "dataproc:dataproc.allow.zero.workers", "true" },
                },
            },
            GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
            {
                Tags = new[]
                {
                    "foo",
                    "bar",
                },
                ServiceAccount = @default.Email,
                ServiceAccountScopes = new[]
                {
                    "cloud-platform",
                },
            },
            InitializationActions = new[]
            {
                new Gcp.Dataproc.Inputs.ClusterClusterConfigInitializationActionArgs
                {
                    Script = "gs://dataproc-initialization-actions/stackdriver/stackdriver.sh",
                    TimeoutSec = 500,
                },
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.serviceaccount.Account;
import com.pulumi.gcp.serviceaccount.AccountArgs;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigWorkerConfigDiskConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigPreemptibleWorkerConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigSoftwareConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigInitializationActionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var default_ = new Account("default", AccountArgs.builder()
            .accountId("service-account-id")
            .displayName("Service Account")
            .build());

        var mycluster = new Cluster("mycluster", ClusterArgs.builder()
            .name("mycluster")
            .region("us-central1")
            .gracefulDecommissionTimeout("120s")
            .labels(Map.of("foo", "bar"))
            .clusterConfig(ClusterClusterConfigArgs.builder()
                .stagingBucket("dataproc-staging-bucket")
                .clusterTier("CLUSTER_TIER_STANDARD")
                .masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
                    .numInstances(1)
                    .machineType("e2-medium")
                    .diskConfig(ClusterClusterConfigMasterConfigDiskConfigArgs.builder()
                        .bootDiskType("pd-ssd")
                        .bootDiskSizeGb(30)
                        .build())
                    .build())
                .workerConfig(ClusterClusterConfigWorkerConfigArgs.builder()
                    .numInstances(2)
                    .machineType("e2-medium")
                    .minCpuPlatform("Intel Skylake")
                    .diskConfig(ClusterClusterConfigWorkerConfigDiskConfigArgs.builder()
                        .bootDiskSizeGb(30)
                        .numLocalSsds(1)
                        .build())
                    .build())
                .preemptibleWorkerConfig(ClusterClusterConfigPreemptibleWorkerConfigArgs.builder()
                    .numInstances(0)
                    .build())
                .softwareConfig(ClusterClusterConfigSoftwareConfigArgs.builder()
                    .imageVersion("2.0.35-debian10")
                    .overrideProperties(Map.of("dataproc:dataproc.allow.zero.workers", "true"))
                    .build())
                .gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
                    .tags("foo", "bar")
                    .serviceAccount(default_.email())
                    .serviceAccountScopes("cloud-platform")
                    .build())
                .initializationActions(ClusterClusterConfigInitializationActionArgs.builder()
                    .script("gs://dataproc-initialization-actions/stackdriver/stackdriver.sh")
                    .timeoutSec(500)
                    .build())
                .build())
            .build());

    }
}
resources:
  default:
    type: gcp:serviceaccount:Account
    properties:
      accountId: service-account-id
      displayName: Service Account
  mycluster:
    type: gcp:dataproc:Cluster
    properties:
      name: mycluster
      region: us-central1
      gracefulDecommissionTimeout: 120s
      labels:
        foo: bar
      clusterConfig:
        stagingBucket: dataproc-staging-bucket
        clusterTier: CLUSTER_TIER_STANDARD
        masterConfig:
          numInstances: 1
          machineType: e2-medium
          diskConfig:
            bootDiskType: pd-ssd
            bootDiskSizeGb: 30
        workerConfig:
          numInstances: 2
          machineType: e2-medium
          minCpuPlatform: Intel Skylake
          diskConfig:
            bootDiskSizeGb: 30
            numLocalSsds: 1
        preemptibleWorkerConfig:
          numInstances: 0
        softwareConfig:
          imageVersion: 2.0.35-debian10
          overrideProperties:
            dataproc:dataproc.allow.zero.workers: 'true'
        gceClusterConfig:
          tags:
            - foo
            - bar
          serviceAccount: ${default.email}
          serviceAccountScopes:
            - cloud-platform
        initializationActions:
          - script: gs://dataproc-initialization-actions/stackdriver/stackdriver.sh
            timeoutSec: 500

The clusterConfig block defines the cluster topology. The masterConfig and workerConfig blocks set machine types and disk configurations for each node role. The initializationActions array runs scripts from GCS at cluster startup, enabling you to install Stackdriver agents or custom dependencies. The gceClusterConfig block attaches a service account for GCP API access and applies network tags for firewall rules.
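
The example points stagingBucket at a pre-existing bucket name. If you would rather manage that bucket in the same program, a minimal TypeScript sketch is shown below; the bucket resource name, location, and forceDestroy setting are assumptions rather than part of the original example, and the cluster's service account still needs read/write access to the bucket.

import * as gcp from "@pulumi/gcp";

// Hypothetical staging bucket managed alongside the cluster.
const staging = new gcp.storage.Bucket("dataproc-staging", {
    location: "US",
    uniformBucketLevelAccess: true,
    forceDestroy: true, // allow `pulumi destroy` even after Dataproc has written temp files
});

const cluster = new gcp.dataproc.Cluster("mycluster", {
    region: "us-central1",
    clusterConfig: {
        // Reference the generated bucket name instead of a hard-coded string.
        stagingBucket: staging.name,
    },
});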

Attach GPU accelerators for ML workloads

Machine learning workloads running on Dataproc often require GPU acceleration for training or inference.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

const acceleratedCluster = new gcp.dataproc.Cluster("accelerated_cluster", {
    name: "my-cluster-with-gpu",
    region: "us-central1",
    clusterConfig: {
        gceClusterConfig: {
            zone: "us-central1-a",
        },
        masterConfig: {
            accelerators: [{
                acceleratorType: "nvidia-tesla-k80",
                acceleratorCount: 1,
            }],
        },
    },
});
import pulumi
import pulumi_gcp as gcp

accelerated_cluster = gcp.dataproc.Cluster("accelerated_cluster",
    name="my-cluster-with-gpu",
    region="us-central1",
    cluster_config={
        "gce_cluster_config": {
            "zone": "us-central1-a",
        },
        "master_config": {
            "accelerators": [{
                "accelerator_type": "nvidia-tesla-k80",
                "accelerator_count": 1,
            }],
        },
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewCluster(ctx, "accelerated_cluster", &dataproc.ClusterArgs{
			Name:   pulumi.String("my-cluster-with-gpu"),
			Region: pulumi.String("us-central1"),
			ClusterConfig: &dataproc.ClusterClusterConfigArgs{
				GceClusterConfig: &dataproc.ClusterClusterConfigGceClusterConfigArgs{
					Zone: pulumi.String("us-central1-a"),
				},
				MasterConfig: &dataproc.ClusterClusterConfigMasterConfigArgs{
					Accelerators: dataproc.ClusterClusterConfigMasterConfigAcceleratorArray{
						&dataproc.ClusterClusterConfigMasterConfigAcceleratorArgs{
							AcceleratorType:  pulumi.String("nvidia-tesla-k80"),
							AcceleratorCount: pulumi.Int(1),
						},
					},
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    var acceleratedCluster = new Gcp.Dataproc.Cluster("accelerated_cluster", new()
    {
        Name = "my-cluster-with-gpu",
        Region = "us-central1",
        ClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigArgs
        {
            GceClusterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigGceClusterConfigArgs
            {
                Zone = "us-central1-a",
            },
            MasterConfig = new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigArgs
            {
                Accelerators = new[]
                {
                    new Gcp.Dataproc.Inputs.ClusterClusterConfigMasterConfigAcceleratorArgs
                    {
                        AcceleratorType = "nvidia-tesla-k80",
                        AcceleratorCount = 1,
                    },
                },
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Cluster;
import com.pulumi.gcp.dataproc.ClusterArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigGceClusterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigArgs;
import com.pulumi.gcp.dataproc.inputs.ClusterClusterConfigMasterConfigAcceleratorArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var acceleratedCluster = new Cluster("acceleratedCluster", ClusterArgs.builder()
            .name("my-cluster-with-gpu")
            .region("us-central1")
            .clusterConfig(ClusterClusterConfigArgs.builder()
                .gceClusterConfig(ClusterClusterConfigGceClusterConfigArgs.builder()
                    .zone("us-central1-a")
                    .build())
                .masterConfig(ClusterClusterConfigMasterConfigArgs.builder()
                    .accelerators(ClusterClusterConfigMasterConfigAcceleratorArgs.builder()
                        .acceleratorType("nvidia-tesla-k80")
                        .acceleratorCount(1)
                        .build())
                    .build())
                .build())
            .build());

    }
}
resources:
  acceleratedCluster:
    type: gcp:dataproc:Cluster
    name: accelerated_cluster
    properties:
      name: my-cluster-with-gpu
      region: us-central1
      clusterConfig:
        gceClusterConfig:
          zone: us-central1-a
        masterConfig:
          accelerators:
            - acceleratorType: nvidia-tesla-k80
              acceleratorCount: 1

The accelerators block within masterConfig or workerConfig attaches GPUs to nodes. The acceleratorType specifies the GPU model (e.g., nvidia-tesla-k80), and acceleratorCount sets how many GPUs per node. The zone property must specify a zone that supports the chosen accelerator type.
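
GPUs can be attached to workers the same way, and Dataproc does not install NVIDIA drivers by default, so clusters typically pair accelerators with a driver-install initialization action. A hedged TypeScript sketch follows; the T4 model, n1-standard-8 machine type, zone availability, and the initialization script path are assumptions to verify against current GCP documentation.

import * as gcp from "@pulumi/gcp";

const gpuWorkers = new gcp.dataproc.Cluster("gpu-workers", {
    name: "gpu-worker-cluster",
    region: "us-central1",
    clusterConfig: {
        gceClusterConfig: {
            zone: "us-central1-a", // pick a zone that offers the chosen GPU model
        },
        workerConfig: {
            numInstances: 2,
            machineType: "n1-standard-8", // assumption: the GPU model attaches to N1 machine types
            accelerators: [{
                acceleratorType: "nvidia-tesla-t4",
                acceleratorCount: 1,
            }],
        },
        initializationActions: [{
            // Assumed path to Google's public driver-install action; confirm before use.
            script: "gs://goog-dataproc-initialization-actions-us-central1/gpu/install_gpu_driver.sh",
            timeoutSec: 1800,
        }],
    },
});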

Beyond these examples

These snippets focus on specific cluster-level features: cluster sizing and machine types, disk configuration and initialization scripts, and GPU accelerators for ML workloads. They’re intentionally minimal rather than full data processing environments.

The examples may reference pre-existing infrastructure such as GCS staging buckets, service accounts with appropriate IAM roles, and initialization scripts in GCS. They focus on configuring the cluster rather than provisioning everything around it.
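
For instance, when you attach a custom service account as in the second example, that account needs Dataproc worker permissions before the cluster can start. A hedged TypeScript sketch of the IAM grant (the project ID is a placeholder, and _default refers to the service account created earlier):

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Grant the cluster's VM service account the Dataproc worker role.
const workerRole = new gcp.projects.IAMMember("dataproc-worker-role", {
    project: "my-project", // placeholder project ID
    role: "roles/dataproc.worker",
    member: pulumi.interpolate`serviceAccount:${_default.email}`,
});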

To keep things focused, common cluster patterns are omitted, including:

  • Preemptible workers for cost optimization (preemptibleWorkerConfig)
  • Virtual clusters on GKE (virtualClusterConfig)
  • Autoscaling policies (autoscalingConfig)
  • Network and subnetwork configuration (gceClusterConfig.network)
  • Encryption keys and security settings
  • Graceful decommissioning timeouts (gracefulDecommissionTimeout)

These omissions are intentional: the goal is to illustrate how each cluster feature is wired, not provide drop-in data processing modules. See the Dataproc Cluster resource reference for all available configuration options.

Let's create GCP Dataproc Clusters

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Frequently Asked Questions

Configuration & Updates
What properties can I change without recreating my cluster?
Only three properties are updatable: labels, cluster_config.worker_config.num_instances, and cluster_config.preemptible_worker_config.num_instances. Changing any other property causes full cluster recreation due to API limitations.
Why is my min_num_instances change being ignored?
Changes to cluster_config.worker_config.min_num_instances are silently ignored by the Dataproc API. Use num_instances instead to scale worker nodes.
How do I scale worker nodes without downtime?
Set gracefulDecommissionTimeout (e.g., “120s”) to allow graceful decommissioning when changing num_instances for workers or preemptible workers.
Cluster Setup & Initialization
What's the default region for Dataproc clusters?
Clusters default to the global region if not specified. The region is immutable after creation.
What is the staging bucket used for?
The stagingBucket in clusterConfig stores temporary files and job artifacts during cluster operations.
How do I run initialization scripts when my cluster starts?
Configure initializationActions in clusterConfig with a script path (typically a GCS URI like “gs://bucket/script.sh”) and optional timeoutSec.
How do I configure a service account for my cluster?
Set serviceAccount (email) and serviceAccountScopes (e.g., [“cloud-platform”]) within gceClusterConfig.
Compute Resources
How do I add GPU accelerators to my cluster?
Configure accelerators within masterConfig or workerConfig, specifying acceleratorType (e.g., “nvidia-tesla-k80”) and acceleratorCount. You may also need to specify a zone in gceClusterConfig.
Can I create a cluster with zero workers?
Yes, set dataproc:dataproc.allow.zero.workers to “true” in softwareConfig.overrideProperties and configure workerConfig.numInstances to 0.
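
A minimal TypeScript sketch of such a single-node cluster, based on the answer above; the resource name, cluster name, region, and machine type are placeholders:

import * as gcp from "@pulumi/gcp";

const singleNode = new gcp.dataproc.Cluster("single-node", {
    name: "single-node-cluster",
    region: "us-central1",
    clusterConfig: {
        masterConfig: {
            numInstances: 1,
            machineType: "e2-standard-4",
        },
        workerConfig: {
            numInstances: 0, // zero workers: all processing runs on the master node
        },
        softwareConfig: {
            overrideProperties: {
                "dataproc:dataproc.allow.zero.workers": "true",
            },
        },
    },
});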

Using a different cloud?

Explore analytics guides for other cloud providers.