Deploy GCP Dataproc Serverless Batches

The gcp:dataproc/batch:Batch resource, part of the Pulumi GCP provider, defines a Dataproc Serverless batch job: the Spark workload type, runtime configuration, and execution environment. This guide focuses on three capabilities: Spark and PySpark batch execution, SQL query execution from Cloud Storage, and automatic performance tuning.

Dataproc Serverless batches run without provisioning clusters, but they reference VPC subnetworks and Cloud Storage paths for code and data. The examples are intentionally small. Combine them with your own network configuration, storage buckets, and optional Metastore integration.

Run a Spark job with basic runtime configuration

Most Dataproc Serverless deployments start with a Spark batch that specifies the main class, runtime properties, and execution environment.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Dataproc Serverless Spark batch: runs the SparkPi example from the
// runtime's built-in example JARs with a fixed executor count.
const exampleBatchSpark = new gcp.dataproc.Batch("example_batch_spark", {
    batchId: "tf-test-batch_15335",
    location: "us-central1",
    labels: {
        batch_test: "terraform",
    },
    // Spark properties applied to the batch session.
    runtimeConfig: {
        properties: {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    // Network placement and execution limits for the workload.
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default", // VPC subnetwork the workload attaches to
            ttl: "3600s", // execution time limit for the batch
            networkTags: ["tag1"],
        },
    },
    // The Spark application: entry class, CLI args, and JARs to load.
    sparkBatch: {
        mainClass: "org.apache.spark.examples.SparkPi",
        args: ["10"],
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    },
});
import pulumi
import pulumi_gcp as gcp

# Dataproc Serverless Spark batch: runs the SparkPi example from the
# runtime's built-in example JARs with a fixed executor count.
example_batch_spark = gcp.dataproc.Batch("example_batch_spark",
    batch_id="tf-test-batch_15335",
    location="us-central1",
    labels={
        "batch_test": "terraform",
    },
    # Spark properties applied to the batch session.
    runtime_config={
        "properties": {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    # Network placement and execution limits for the workload.
    environment_config={
        "execution_config": {
            "subnetwork_uri": "default",  # VPC subnetwork the workload attaches to
            "ttl": "3600s",  # execution time limit for the batch
            "network_tags": ["tag1"],
        },
    },
    # The Spark application: entry class, CLI args, and JARs to load.
    spark_batch={
        "main_class": "org.apache.spark.examples.SparkPi",
        "args": ["10"],
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// Dataproc Serverless Spark batch: runs the SparkPi example from the
// runtime's built-in example JARs with a fixed executor count.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewBatch(ctx, "example_batch_spark", &dataproc.BatchArgs{
			BatchId:  pulumi.String("tf-test-batch_15335"),
			Location: pulumi.String("us-central1"),
			Labels: pulumi.StringMap{
				"batch_test": pulumi.String("terraform"),
			},
			// Spark properties applied to the batch session.
			RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
				Properties: pulumi.StringMap{
					"spark.dynamicAllocation.enabled": pulumi.String("false"),
					"spark.executor.instances":        pulumi.String("2"),
				},
			},
			// Network placement and execution limits for the workload.
			EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
				ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
					SubnetworkUri: pulumi.String("default"), // VPC subnetwork the workload attaches to
					Ttl:           pulumi.String("3600s"),   // execution time limit for the batch
					NetworkTags: pulumi.StringArray{
						pulumi.String("tag1"),
					},
				},
			},
			// The Spark application: entry class, CLI args, and JARs to load.
			SparkBatch: &dataproc.BatchSparkBatchArgs{
				MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
				Args: pulumi.StringArray{
					pulumi.String("10"),
				},
				JarFileUris: pulumi.StringArray{
					pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    // Dataproc Serverless Spark batch: runs the SparkPi example from the
    // runtime's built-in example JARs with a fixed executor count.
    var exampleBatchSpark = new Gcp.Dataproc.Batch("example_batch_spark", new()
    {
        BatchId = "tf-test-batch_15335",
        Location = "us-central1",
        Labels = 
        {
            { "batch_test", "terraform" },
        },
        // Spark properties applied to the batch session.
        RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
        {
            Properties = 
            {
                { "spark.dynamicAllocation.enabled", "false" },
                { "spark.executor.instances", "2" },
            },
        },
        // Network placement and execution limits for the workload.
        EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
        {
            ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
            {
                SubnetworkUri = "default", // VPC subnetwork the workload attaches to
                Ttl = "3600s", // execution time limit for the batch
                NetworkTags = new[]
                {
                    "tag1",
                },
            },
        },
        // The Spark application: entry class, CLI args, and JARs to load.
        SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
        {
            MainClass = "org.apache.spark.examples.SparkPi",
            Args = new[]
            {
                "10",
            },
            JarFileUris = new[]
            {
                "file:///usr/lib/spark/examples/jars/spark-examples.jar",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Dataproc Serverless Spark batch: runs the SparkPi example from the
        // runtime's built-in example JARs with a fixed executor count.
        var exampleBatchSpark = new Batch("exampleBatchSpark", BatchArgs.builder()
            .batchId("tf-test-batch_15335")
            .location("us-central1")
            .labels(Map.of("batch_test", "terraform"))
            // Spark properties applied to the batch session.
            .runtimeConfig(BatchRuntimeConfigArgs.builder()
                .properties(Map.ofEntries(
                    Map.entry("spark.dynamicAllocation.enabled", "false"),
                    Map.entry("spark.executor.instances", "2")
                ))
                .build())
            // Network placement and execution limits for the workload.
            .environmentConfig(BatchEnvironmentConfigArgs.builder()
                .executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
                    .subnetworkUri("default") // VPC subnetwork the workload attaches to
                    .ttl("3600s") // execution time limit for the batch
                    .networkTags("tag1")
                    .build())
                .build())
            // The Spark application: entry class, CLI args, and JARs to load.
            .sparkBatch(BatchSparkBatchArgs.builder()
                .mainClass("org.apache.spark.examples.SparkPi")
                .args("10")
                .jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
                .build())
            .build());

    }
}
resources:
  # Dataproc Serverless Spark batch: runs the SparkPi example from the
  # runtime's built-in example JARs with a fixed executor count.
  exampleBatchSpark:
    type: gcp:dataproc:Batch
    name: example_batch_spark
    properties:
      batchId: tf-test-batch_15335
      location: us-central1
      labels:
        batch_test: terraform
      # Spark properties applied to the batch session.
      runtimeConfig:
        properties:
          spark.dynamicAllocation.enabled: 'false'
          spark.executor.instances: '2'
      # Network placement and execution limits for the workload.
      environmentConfig:
        executionConfig:
          subnetworkUri: default # VPC subnetwork the workload attaches to
          ttl: 3600s # execution time limit for the batch
          networkTags:
            - tag1
      # The Spark application: entry class, CLI args, and JARs to load.
      sparkBatch:
        mainClass: org.apache.spark.examples.SparkPi
        args:
          - '10'
        jarFileUris:
          - file:///usr/lib/spark/examples/jars/spark-examples.jar

The sparkBatch property defines the Spark application: mainClass specifies the entry point, jarFileUris lists the JARs to load, and args passes command-line arguments. The runtimeConfig controls Spark properties like executor count, while environmentConfig places the job in a specific subnetwork and sets execution limits via ttl.

Execute SQL queries from Cloud Storage

Analytics teams often store SQL queries in Cloud Storage and execute them against data lakes using Spark SQL.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Spark SQL batch: executes a SQL script stored in Cloud Storage,
// substituting queryVariables into the script at run time.
const exampleBatchSparsql = new gcp.dataproc.Batch("example_batch_sparsql", {
    batchId: "tf-test-batch_20665",
    location: "us-central1",
    // Spark properties applied to the batch session.
    runtimeConfig: {
        properties: {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default", // VPC subnetwork the workload attaches to
        },
    },
    sparkSqlBatch: {
        queryFileUri: "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql", // SQL script in GCS
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        // name/value pairs passed as parameters into the query.
        queryVariables: {
            name: "value",
        },
    },
});
import pulumi
import pulumi_gcp as gcp

# Spark SQL batch: executes a SQL script stored in Cloud Storage,
# substituting query_variables into the script at run time.
example_batch_sparsql = gcp.dataproc.Batch("example_batch_sparsql",
    batch_id="tf-test-batch_20665",
    location="us-central1",
    # Spark properties applied to the batch session.
    runtime_config={
        "properties": {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    environment_config={
        "execution_config": {
            "subnetwork_uri": "default",  # VPC subnetwork the workload attaches to
        },
    },
    spark_sql_batch={
        "query_file_uri": "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",  # SQL script in GCS
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        # name/value pairs passed as parameters into the query.
        "query_variables": {
            "name": "value",
        },
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// Spark SQL batch: executes a SQL script stored in Cloud Storage,
// substituting QueryVariables into the script at run time.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewBatch(ctx, "example_batch_sparsql", &dataproc.BatchArgs{
			BatchId:  pulumi.String("tf-test-batch_20665"),
			Location: pulumi.String("us-central1"),
			// Spark properties applied to the batch session.
			RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
				Properties: pulumi.StringMap{
					"spark.dynamicAllocation.enabled": pulumi.String("false"),
					"spark.executor.instances":        pulumi.String("2"),
				},
			},
			EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
				ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
					SubnetworkUri: pulumi.String("default"), // VPC subnetwork the workload attaches to
				},
			},
			SparkSqlBatch: &dataproc.BatchSparkSqlBatchArgs{
				// SQL script in GCS to execute.
				QueryFileUri: pulumi.String("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql"),
				JarFileUris: pulumi.StringArray{
					pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
				},
				// name/value pairs passed as parameters into the query.
				QueryVariables: pulumi.StringMap{
					"name": pulumi.String("value"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    // Spark SQL batch: executes a SQL script stored in Cloud Storage,
    // substituting QueryVariables into the script at run time.
    var exampleBatchSparsql = new Gcp.Dataproc.Batch("example_batch_sparsql", new()
    {
        BatchId = "tf-test-batch_20665",
        Location = "us-central1",
        // Spark properties applied to the batch session.
        RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
        {
            Properties = 
            {
                { "spark.dynamicAllocation.enabled", "false" },
                { "spark.executor.instances", "2" },
            },
        },
        EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
        {
            ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
            {
                SubnetworkUri = "default", // VPC subnetwork the workload attaches to
            },
        },
        SparkSqlBatch = new Gcp.Dataproc.Inputs.BatchSparkSqlBatchArgs
        {
            // SQL script in GCS to execute.
            QueryFileUri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
            JarFileUris = new[]
            {
                "file:///usr/lib/spark/examples/jars/spark-examples.jar",
            },
            // name/value pairs passed as parameters into the query.
            QueryVariables = 
            {
                { "name", "value" },
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkSqlBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Spark SQL batch: executes a SQL script stored in Cloud Storage,
        // substituting queryVariables into the script at run time.
        var exampleBatchSparsql = new Batch("exampleBatchSparsql", BatchArgs.builder()
            .batchId("tf-test-batch_20665")
            .location("us-central1")
            // Spark properties applied to the batch session.
            .runtimeConfig(BatchRuntimeConfigArgs.builder()
                .properties(Map.ofEntries(
                    Map.entry("spark.dynamicAllocation.enabled", "false"),
                    Map.entry("spark.executor.instances", "2")
                ))
                .build())
            .environmentConfig(BatchEnvironmentConfigArgs.builder()
                .executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
                    .subnetworkUri("default") // VPC subnetwork the workload attaches to
                    .build())
                .build())
            .sparkSqlBatch(BatchSparkSqlBatchArgs.builder()
                // SQL script in GCS; queryVariables are passed as parameters into the query.
                .queryFileUri("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql")
                .jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
                .queryVariables(Map.of("name", "value"))
                .build())
            .build());

    }
}
resources:
  # Spark SQL batch: executes a SQL script stored in Cloud Storage,
  # substituting queryVariables into the script at run time.
  exampleBatchSparsql:
    type: gcp:dataproc:Batch
    name: example_batch_sparsql
    properties:
      batchId: tf-test-batch_20665
      location: us-central1
      # Spark properties applied to the batch session.
      runtimeConfig:
        properties:
          spark.dynamicAllocation.enabled: 'false'
          spark.executor.instances: '2'
      environmentConfig:
        executionConfig:
          subnetworkUri: default # VPC subnetwork the workload attaches to
      sparkSqlBatch:
        # SQL script in GCS to execute.
        queryFileUri: gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql
        jarFileUris:
          - file:///usr/lib/spark/examples/jars/spark-examples.jar
        # name/value pairs passed as parameters into the query.
        queryVariables:
          name: value

The sparkSqlBatch property points to a SQL file in GCS via queryFileUri. The queryVariables property passes parameters into the query, allowing you to parameterize table names or filter conditions. Spark SQL executes the query and returns results without requiring you to write Spark code.

Run Python Spark jobs with dependencies

Python-based Spark applications often require additional Python files, archives, and data files distributed to executors.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// PySpark batch: a main Python file plus extra Python modules, archives,
// and data files, all staged to executors by Dataproc.
const exampleBatchPyspark = new gcp.dataproc.Batch("example_batch_pyspark", {
    batchId: "tf-test-batch_85160",
    location: "us-central1",
    // Spark properties applied to the batch session.
    runtimeConfig: {
        properties: {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default", // VPC subnetwork the workload attaches to
        },
    },
    pysparkBatch: {
        mainPythonFileUri: "https://storage.googleapis.com/terraform-batches/test_util.py", // application entry point
        args: ["10"],
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        pythonFileUris: ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"], // additional Python modules
        // Compressed dependencies; the "#unpacked" fragment requests extraction.
        archiveUris: [
            "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
            "https://storage.googleapis.com/terraform-batches/animals.txt.jar",
            "https://storage.googleapis.com/terraform-batches/animals.txt",
        ],
        fileUris: ["https://storage.googleapis.com/terraform-batches/people.txt"], // plain data files
    },
});
import pulumi
import pulumi_gcp as gcp

# PySpark batch: a main Python file plus extra Python modules, archives,
# and data files, all staged to executors by Dataproc.
example_batch_pyspark = gcp.dataproc.Batch("example_batch_pyspark",
    batch_id="tf-test-batch_85160",
    location="us-central1",
    # Spark properties applied to the batch session.
    runtime_config={
        "properties": {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
    },
    environment_config={
        "execution_config": {
            "subnetwork_uri": "default",  # VPC subnetwork the workload attaches to
        },
    },
    pyspark_batch={
        "main_python_file_uri": "https://storage.googleapis.com/terraform-batches/test_util.py",  # application entry point
        "args": ["10"],
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
        "python_file_uris": ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"],  # additional Python modules
        # Compressed dependencies; the "#unpacked" fragment requests extraction.
        "archive_uris": [
            "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
            "https://storage.googleapis.com/terraform-batches/animals.txt.jar",
            "https://storage.googleapis.com/terraform-batches/animals.txt",
        ],
        "file_uris": ["https://storage.googleapis.com/terraform-batches/people.txt"],  # plain data files
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// PySpark batch: a main Python file plus extra Python modules, archives,
// and data files, all staged to executors by Dataproc.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewBatch(ctx, "example_batch_pyspark", &dataproc.BatchArgs{
			BatchId:  pulumi.String("tf-test-batch_85160"),
			Location: pulumi.String("us-central1"),
			// Spark properties applied to the batch session.
			RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
				Properties: pulumi.StringMap{
					"spark.dynamicAllocation.enabled": pulumi.String("false"),
					"spark.executor.instances":        pulumi.String("2"),
				},
			},
			EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
				ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
					SubnetworkUri: pulumi.String("default"), // VPC subnetwork the workload attaches to
				},
			},
			PysparkBatch: &dataproc.BatchPysparkBatchArgs{
				// Application entry point.
				MainPythonFileUri: pulumi.String("https://storage.googleapis.com/terraform-batches/test_util.py"),
				Args: pulumi.StringArray{
					pulumi.String("10"),
				},
				JarFileUris: pulumi.StringArray{
					pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
				},
				// Additional Python modules distributed to executors.
				PythonFileUris: pulumi.StringArray{
					pulumi.String("gs://dataproc-examples/pyspark/hello-world/hello-world.py"),
				},
				// Compressed dependencies; the "#unpacked" fragment requests extraction.
				ArchiveUris: pulumi.StringArray{
					pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked"),
					pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.jar"),
					pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt"),
				},
				// Plain data files distributed to executors.
				FileUris: pulumi.StringArray{
					pulumi.String("https://storage.googleapis.com/terraform-batches/people.txt"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    // PySpark batch: a main Python file plus extra Python modules, archives,
    // and data files, all staged to executors by Dataproc.
    var exampleBatchPyspark = new Gcp.Dataproc.Batch("example_batch_pyspark", new()
    {
        BatchId = "tf-test-batch_85160",
        Location = "us-central1",
        // Spark properties applied to the batch session.
        RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
        {
            Properties = 
            {
                { "spark.dynamicAllocation.enabled", "false" },
                { "spark.executor.instances", "2" },
            },
        },
        EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
        {
            ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
            {
                SubnetworkUri = "default", // VPC subnetwork the workload attaches to
            },
        },
        PysparkBatch = new Gcp.Dataproc.Inputs.BatchPysparkBatchArgs
        {
            // Application entry point.
            MainPythonFileUri = "https://storage.googleapis.com/terraform-batches/test_util.py",
            Args = new[]
            {
                "10",
            },
            JarFileUris = new[]
            {
                "file:///usr/lib/spark/examples/jars/spark-examples.jar",
            },
            // Additional Python modules distributed to executors.
            PythonFileUris = new[]
            {
                "gs://dataproc-examples/pyspark/hello-world/hello-world.py",
            },
            // Compressed dependencies; the "#unpacked" fragment requests extraction.
            ArchiveUris = new[]
            {
                "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
                "https://storage.googleapis.com/terraform-batches/animals.txt.jar",
                "https://storage.googleapis.com/terraform-batches/animals.txt",
            },
            // Plain data files distributed to executors.
            FileUris = new[]
            {
                "https://storage.googleapis.com/terraform-batches/people.txt",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchPysparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // PySpark batch: a main Python file plus extra Python modules, archives,
        // and data files, all staged to executors by Dataproc.
        var exampleBatchPyspark = new Batch("exampleBatchPyspark", BatchArgs.builder()
            .batchId("tf-test-batch_85160")
            .location("us-central1")
            // Spark properties applied to the batch session.
            .runtimeConfig(BatchRuntimeConfigArgs.builder()
                .properties(Map.ofEntries(
                    Map.entry("spark.dynamicAllocation.enabled", "false"),
                    Map.entry("spark.executor.instances", "2")
                ))
                .build())
            .environmentConfig(BatchEnvironmentConfigArgs.builder()
                .executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
                    .subnetworkUri("default") // VPC subnetwork the workload attaches to
                    .build())
                .build())
            .pysparkBatch(BatchPysparkBatchArgs.builder()
                // Application entry point; pythonFileUris are extra modules distributed to executors.
                .mainPythonFileUri("https://storage.googleapis.com/terraform-batches/test_util.py")
                .args("10")
                .jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
                .pythonFileUris("gs://dataproc-examples/pyspark/hello-world/hello-world.py")
                // Compressed dependencies; the "#unpacked" fragment requests extraction.
                .archiveUris(                
                    "https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
                    "https://storage.googleapis.com/terraform-batches/animals.txt.jar",
                    "https://storage.googleapis.com/terraform-batches/animals.txt")
                .fileUris("https://storage.googleapis.com/terraform-batches/people.txt")
                .build())
            .build());

    }
}
resources:
  # PySpark batch: a main Python file plus extra Python modules, archives,
  # and data files, all staged to executors by Dataproc.
  exampleBatchPyspark:
    type: gcp:dataproc:Batch
    name: example_batch_pyspark
    properties:
      batchId: tf-test-batch_85160
      location: us-central1
      # Spark properties applied to the batch session.
      runtimeConfig:
        properties:
          spark.dynamicAllocation.enabled: 'false'
          spark.executor.instances: '2'
      environmentConfig:
        executionConfig:
          subnetworkUri: default # VPC subnetwork the workload attaches to
      pysparkBatch:
        # Application entry point.
        mainPythonFileUri: https://storage.googleapis.com/terraform-batches/test_util.py
        args:
          - '10'
        jarFileUris:
          - file:///usr/lib/spark/examples/jars/spark-examples.jar
        # Additional Python modules distributed to executors.
        pythonFileUris:
          - gs://dataproc-examples/pyspark/hello-world/hello-world.py
        # Compressed dependencies; the "#unpacked" fragment requests extraction.
        archiveUris:
          - https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked
          - https://storage.googleapis.com/terraform-batches/animals.txt.jar
          - https://storage.googleapis.com/terraform-batches/animals.txt
        # Plain data files distributed to executors.
        fileUris:
          - https://storage.googleapis.com/terraform-batches/people.txt

The pysparkBatch property specifies the main Python file via mainPythonFileUri. The pythonFileUris property lists additional Python modules to distribute, archiveUris handles compressed dependencies (with optional extraction via the #unpacked suffix), and fileUris distributes data files. Dataproc automatically stages these files to all executors.

Enable automatic performance optimization

Spark jobs often require tuning for memory and scaling to achieve optimal performance. Autotuning analyzes job characteristics and adjusts configuration automatically.

import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Spark batch with autotuning enabled for the SCALING and MEMORY scenarios,
// pinned to runtime version 2.2.
const exampleBatchAutotuning = new gcp.dataproc.Batch("example_batch_autotuning", {
    batchId: "tf-test-batch_16199",
    location: "us-central1",
    labels: {
        batch_test: "terraform",
    },
    runtimeConfig: {
        version: "2.2", // Dataproc Serverless runtime version
        properties: {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
        // NOTE(review): cohort appears to group related batch runs for
        // autotuning comparison — confirm against provider docs.
        cohort: "tf-dataproc-batch-example",
        autotuningConfig: {
            // Dimensions the autotuner is allowed to adjust.
            scenarios: [
                "SCALING",
                "MEMORY",
            ],
        },
    },
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default", // VPC subnetwork the workload attaches to
            ttl: "3600s", // execution time limit for the batch
        },
    },
    // The Spark application: entry class, CLI args, and JARs to load.
    sparkBatch: {
        mainClass: "org.apache.spark.examples.SparkPi",
        args: ["10"],
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    },
});
import pulumi
import pulumi_gcp as gcp

# Spark batch with autotuning enabled for the SCALING and MEMORY scenarios,
# pinned to runtime version 2.2.
example_batch_autotuning = gcp.dataproc.Batch("example_batch_autotuning",
    batch_id="tf-test-batch_16199",
    location="us-central1",
    labels={
        "batch_test": "terraform",
    },
    runtime_config={
        "version": "2.2",  # Dataproc Serverless runtime version
        "properties": {
            "spark.dynamicAllocation.enabled": "false",
            "spark.executor.instances": "2",
        },
        # NOTE(review): cohort appears to group related batch runs for
        # autotuning comparison — confirm against provider docs.
        "cohort": "tf-dataproc-batch-example",
        "autotuning_config": {
            # Dimensions the autotuner is allowed to adjust.
            "scenarios": [
                "SCALING",
                "MEMORY",
            ],
        },
    },
    environment_config={
        "execution_config": {
            "subnetwork_uri": "default",  # VPC subnetwork the workload attaches to
            "ttl": "3600s",  # execution time limit for the batch
        },
    },
    # The Spark application: entry class, CLI args, and JARs to load.
    spark_batch={
        "main_class": "org.apache.spark.examples.SparkPi",
        "args": ["10"],
        "jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// Spark batch with autotuning enabled for the SCALING and MEMORY scenarios,
// pinned to runtime version 2.2.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := dataproc.NewBatch(ctx, "example_batch_autotuning", &dataproc.BatchArgs{
			BatchId:  pulumi.String("tf-test-batch_16199"),
			Location: pulumi.String("us-central1"),
			Labels: pulumi.StringMap{
				"batch_test": pulumi.String("terraform"),
			},
			RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
				Version: pulumi.String("2.2"), // Dataproc Serverless runtime version
				Properties: pulumi.StringMap{
					"spark.dynamicAllocation.enabled": pulumi.String("false"),
					"spark.executor.instances":        pulumi.String("2"),
				},
				// NOTE(review): cohort appears to group related batch runs for
				// autotuning comparison — confirm against provider docs.
				Cohort: pulumi.String("tf-dataproc-batch-example"),
				AutotuningConfig: &dataproc.BatchRuntimeConfigAutotuningConfigArgs{
					// Dimensions the autotuner is allowed to adjust.
					Scenarios: pulumi.StringArray{
						pulumi.String("SCALING"),
						pulumi.String("MEMORY"),
					},
				},
			},
			EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
				ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
					SubnetworkUri: pulumi.String("default"), // VPC subnetwork the workload attaches to
					Ttl:           pulumi.String("3600s"),   // execution time limit for the batch
				},
			},
			// The Spark application: entry class, CLI args, and JARs to load.
			SparkBatch: &dataproc.BatchSparkBatchArgs{
				MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
				Args: pulumi.StringArray{
					pulumi.String("10"),
				},
				JarFileUris: pulumi.StringArray{
					pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() => 
{
    // Spark batch with autotuning enabled for the SCALING and MEMORY scenarios,
    // pinned to runtime version 2.2.
    var exampleBatchAutotuning = new Gcp.Dataproc.Batch("example_batch_autotuning", new()
    {
        BatchId = "tf-test-batch_16199",
        Location = "us-central1",
        Labels = 
        {
            { "batch_test", "terraform" },
        },
        RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
        {
            Version = "2.2", // Dataproc Serverless runtime version
            Properties = 
            {
                { "spark.dynamicAllocation.enabled", "false" },
                { "spark.executor.instances", "2" },
            },
            // NOTE(review): cohort appears to group related batch runs for
            // autotuning comparison — confirm against provider docs.
            Cohort = "tf-dataproc-batch-example",
            AutotuningConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigAutotuningConfigArgs
            {
                // Dimensions the autotuner is allowed to adjust.
                Scenarios = new[]
                {
                    "SCALING",
                    "MEMORY",
                },
            },
        },
        EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
        {
            ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
            {
                SubnetworkUri = "default", // VPC subnetwork the workload attaches to
                Ttl = "3600s", // execution time limit for the batch
            },
        },
        // The Spark application: entry class, CLI args, and JARs to load.
        SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
        {
            MainClass = "org.apache.spark.examples.SparkPi",
            Args = new[]
            {
                "10",
            },
            JarFileUris = new[]
            {
                "file:///usr/lib/spark/examples/jars/spark-examples.jar",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigAutotuningConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    // Defines a Dataproc Serverless Spark batch with automatic performance
    // tuning enabled via runtimeConfig.cohort + runtimeConfig.autotuningConfig.
    public static void stack(Context ctx) {
        var exampleBatchAutotuning = new Batch("exampleBatchAutotuning", BatchArgs.builder()
            .batchId("tf-test-batch_16199")
            .location("us-central1")
            .labels(Map.of("batch_test", "terraform"))
            .runtimeConfig(BatchRuntimeConfigArgs.builder()
                // Pin the Dataproc Serverless runtime version.
                .version("2.2")
                .properties(Map.ofEntries(
                    Map.entry("spark.dynamicAllocation.enabled", "false"),
                    Map.entry("spark.executor.instances", "2")
                ))
                // Cohort groups related batches so Dataproc can learn from
                // historical runs and tune similar workloads.
                .cohort("tf-dataproc-batch-example")
                .autotuningConfig(BatchRuntimeConfigAutotuningConfigArgs.builder()
                    // SCALING tunes parallelism/executor count; MEMORY tunes
                    // memory allocation.
                    .scenarios("SCALING", "MEMORY")
                    .build())
                .build())
            .environmentConfig(BatchEnvironmentConfigArgs.builder()
                .executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
                    .subnetworkUri("default")
                    // The batch terminates after this duration (1 hour).
                    .ttl("3600s")
                    .build())
                .build())
            .sparkBatch(BatchSparkBatchArgs.builder()
                .mainClass("org.apache.spark.examples.SparkPi")
                .args("10")
                .jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
                .build())
            .build());
    }
}
resources:
  # Dataproc Serverless Spark batch with automatic performance tuning enabled.
  exampleBatchAutotuning:
    type: gcp:dataproc:Batch
    name: example_batch_autotuning
    properties:
      batchId: tf-test-batch_16199
      location: us-central1
      labels:
        batch_test: terraform
      runtimeConfig:
        # Pin the Dataproc Serverless runtime version.
        version: '2.2'
        properties:
          spark.dynamicAllocation.enabled: 'false'
          spark.executor.instances: '2'
        # Cohort groups related batches so Dataproc can learn from historical
        # runs and apply optimizations across similar workloads.
        cohort: tf-dataproc-batch-example
        autotuningConfig:
          # SCALING adjusts parallelism and executor count; MEMORY tunes
          # memory allocation.
          scenarios:
            - SCALING
            - MEMORY
      environmentConfig:
        executionConfig:
          subnetworkUri: default
          # The batch terminates after this duration (1 hour).
          ttl: 3600s
      sparkBatch:
        mainClass: org.apache.spark.examples.SparkPi
        args:
          - '10'
        jarFileUris:
          - file:///usr/lib/spark/examples/jars/spark-examples.jar

The autotuningConfig property enables automatic optimization for specified scenarios: SCALING adjusts parallelism and executor count, while MEMORY tunes memory allocation. The cohort property groups related batches together, allowing Dataproc to learn from historical runs and apply optimizations across similar workloads.

Beyond these examples

These snippets focus on specific batch-level features: Spark, PySpark, SparkSQL, and SparkR batch types, runtime configuration and autotuning, and dependency distribution. They’re intentionally minimal rather than full data processing pipelines.

The examples may reference pre-existing infrastructure such as VPC subnetworks (examples use "default"), Cloud Storage buckets for code and data, and JAR files and Python scripts at specified URIs. They focus on configuring the batch rather than provisioning the surrounding infrastructure.

To keep things focused, common batch patterns are omitted, including:

  • KMS encryption keys (kmsKey)
  • Custom service accounts (serviceAccount)
  • Metastore integration (metastoreService)
  • Spark History Server configuration
  • Staging bucket configuration
  • Network tags and authentication configuration

These omissions are intentional: the goal is to illustrate how each batch feature is wired, not provide drop-in data processing modules. See the Dataproc Batch resource reference for all available configuration options.

Let's deploy GCP Dataproc Serverless Batches

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Try Pulumi Cloud for FREE

Frequently Asked Questions

Batch Configuration & Types
What batch types are available and how do I choose?
Dataproc Serverless supports four batch types: sparkBatch (Java/Scala with JAR files), pysparkBatch (Python), sparkSqlBatch (SQL queries), and sparkRBatch (R scripts). Specify exactly one batch type per batch resource based on your workload language.
What are the naming requirements for batchId?
The batchId must be 4-63 characters long and may contain only lowercase letters, numbers, and hyphens; it must begin with a lowercase letter and cannot end with a hyphen.
Immutability & Updates
What properties can't I change after batch creation?
Most properties are immutable: batchId, location, project, environmentConfig, all batch type configurations (sparkBatch, pysparkBatch, etc.), and runtimeConfig. Changing these requires replacing the resource.
Why aren't all my labels showing up in the labels field?
The labels field is non-authoritative and only manages labels defined in your configuration. To see all labels on the resource (including those added by GCP services), use effectiveLabels.
Networking & Security
How do I configure VPC networking for my batch?
Set environmentConfig.executionConfig.subnetworkUri to your subnet (e.g., "default"), or use networkUri for the full network path. You can also specify networkTags for firewall rules.
How do I enable KMS encryption for my batch?
Configure environmentConfig.executionConfig.kmsKey with your KMS key name. Ensure the Dataproc service account has the roles/cloudkms.cryptoKeyEncrypterDecrypter role on the key, using dependsOn to enforce ordering.
How do I specify a custom service account?
Set environmentConfig.executionConfig.serviceAccount to your service account email (e.g., compute service account or custom account).
Runtime & Performance
How do I configure Spark properties for my batch?
Use runtimeConfig.properties to set Spark configurations like spark.dynamicAllocation.enabled or spark.executor.instances. All examples demonstrate this pattern.
How do I enable autotuning for my batch?
Configure runtimeConfig.autotuningConfig.scenarios with optimization scenarios like SCALING or MEMORY. Also set runtimeConfig.cohort to group batches for tuning.
How do I set a time-to-live (TTL) for batch execution?
Set environmentConfig.executionConfig.ttl to a duration string like "3600s" (1 hour). The batch will terminate after this time.
Integration & Monitoring
How do I integrate my batch with Dataproc Metastore?
Set environmentConfig.peripheralsConfig.metastoreService to your Metastore service name (e.g., from gcp.dataproc.MetastoreService).
How do I enable Spark History Server for my batch?
Configure environmentConfig.peripheralsConfig.sparkHistoryServerConfig.dataprocCluster with your Dataproc cluster ID. The cluster must have Spark history logging enabled.

Using a different cloud?

Explore analytics guides for other cloud providers: