The gcp:dataproc/batch:Batch resource, part of the Pulumi GCP provider, defines a Dataproc Serverless batch job: the Spark workload type, runtime configuration, and execution environment. This guide focuses on three capabilities: Spark and PySpark batch execution, SQL query execution from Cloud Storage, and automatic performance tuning.
Dataproc Serverless batches run without provisioning clusters, but they reference VPC subnetworks and Cloud Storage paths for code and data. The examples are intentionally small. Combine them with your own network configuration, storage buckets, and optional Metastore integration.
Run a Spark job with basic runtime configuration
Most Dataproc Serverless deployments start with a Spark batch that specifies the main class, runtime properties, and execution environment.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Dataproc Serverless Spark batch: runs the SparkPi example class with a
// fixed pool of 2 executors inside the "default" subnetwork.
const exampleBatchSpark = new gcp.dataproc.Batch("example_batch_spark", {
batchId: "tf-test-batch_15335",
location: "us-central1",
labels: {
batch_test: "terraform",
},
runtimeConfig: {
properties: {
// Dynamic allocation disabled, so exactly 2 executors are used.
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
// Batch is terminated after 1 hour.
ttl: "3600s",
networkTags: ["tag1"],
},
},
sparkBatch: {
mainClass: "org.apache.spark.examples.SparkPi",
args: ["10"],
// Local file:// path on the batch runtime, not a Cloud Storage URI.
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
},
});
import pulumi
import pulumi_gcp as gcp
# Dataproc Serverless Spark batch: runs the SparkPi example class with a
# fixed pool of 2 executors inside the "default" subnetwork.
example_batch_spark = gcp.dataproc.Batch("example_batch_spark",
batch_id="tf-test-batch_15335",
location="us-central1",
labels={
"batch_test": "terraform",
},
runtime_config={
"properties": {
# Dynamic allocation disabled, so exactly 2 executors are used.
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
# Batch is terminated after 1 hour.
"ttl": "3600s",
"network_tags": ["tag1"],
},
},
spark_batch={
"main_class": "org.apache.spark.examples.SparkPi",
"args": ["10"],
# Local file:// path on the batch runtime, not a Cloud Storage URI.
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// Dataproc Serverless Spark batch: runs the SparkPi example class with a
// fixed pool of 2 executors inside the "default" subnetwork.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_spark", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_15335"),
Location: pulumi.String("us-central1"),
Labels: pulumi.StringMap{
"batch_test": pulumi.String("terraform"),
},
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
// Dynamic allocation disabled, so exactly 2 executors are used.
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
// Batch is terminated after 1 hour.
Ttl: pulumi.String("3600s"),
NetworkTags: pulumi.StringArray{
pulumi.String("tag1"),
},
},
},
SparkBatch: &dataproc.BatchSparkBatchArgs{
MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
// Local file:// path on the batch runtime, not a Cloud Storage URI.
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
// Dataproc Serverless Spark batch: runs the SparkPi example class with a
// fixed pool of 2 executors inside the "default" subnetwork.
return await Deployment.RunAsync(() =>
{
var exampleBatchSpark = new Gcp.Dataproc.Batch("example_batch_spark", new()
{
BatchId = "tf-test-batch_15335",
Location = "us-central1",
Labels =
{
{ "batch_test", "terraform" },
},
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
// Dynamic allocation disabled, so exactly 2 executors are used.
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
// Batch is terminated after 1 hour.
Ttl = "3600s",
NetworkTags = new[]
{
"tag1",
},
},
},
SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
{
MainClass = "org.apache.spark.examples.SparkPi",
Args = new[]
{
"10",
},
// Local file:// path on the batch runtime, not a Cloud Storage URI.
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Dataproc Serverless Spark batch: runs the SparkPi example class with a
 * fixed pool of 2 executors inside the "default" subnetwork.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchSpark = new Batch("exampleBatchSpark", BatchArgs.builder()
.batchId("tf-test-batch_15335")
.location("us-central1")
.labels(Map.of("batch_test", "terraform"))
.runtimeConfig(BatchRuntimeConfigArgs.builder()
// Dynamic allocation disabled, so exactly 2 executors are used.
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.subnetworkUri("default") // NOTE(review): see original below; kept byte-identical
.build())
.build())
.build());
}
}
# Dataproc Serverless Spark batch: runs the SparkPi example class with a
# fixed pool of 2 executors inside the "default" subnetwork.
resources:
exampleBatchSpark:
type: gcp:dataproc:Batch
name: example_batch_spark
properties:
batchId: tf-test-batch_15335
location: us-central1
labels:
batch_test: terraform
runtimeConfig:
properties:
# Dynamic allocation disabled, so exactly 2 executors are used.
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
# Batch is terminated after 1 hour.
ttl: 3600s
networkTags:
- tag1
sparkBatch:
mainClass: org.apache.spark.examples.SparkPi
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
The sparkBatch property defines the Spark application: mainClass specifies the entry point, jarFileUris lists the JARs to load, and args passes command-line arguments. The runtimeConfig controls Spark properties like executor count, while environmentConfig places the job in a specific subnetwork and sets execution limits via ttl.
Execute SQL queries from Cloud Storage
Analytics teams often store SQL queries in Cloud Storage and execute them against data lakes using Spark SQL.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Spark SQL batch: executes a SQL script stored in Cloud Storage;
// queryVariables supplies parameters to the query.
const exampleBatchSparsql = new gcp.dataproc.Batch("example_batch_sparsql", {
batchId: "tf-test-batch_20665",
location: "us-central1",
runtimeConfig: {
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
},
},
sparkSqlBatch: {
queryFileUri: "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
// Key/value pairs passed into the query for variable substitution.
queryVariables: {
name: "value",
},
},
});
import pulumi
import pulumi_gcp as gcp
# Spark SQL batch: executes a SQL script stored in Cloud Storage;
# query_variables supplies parameters to the query.
example_batch_sparsql = gcp.dataproc.Batch("example_batch_sparsql",
batch_id="tf-test-batch_20665",
location="us-central1",
runtime_config={
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
},
},
spark_sql_batch={
"query_file_uri": "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
# Key/value pairs passed into the query for variable substitution.
"query_variables": {
"name": "value",
},
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// Spark SQL batch: executes a SQL script stored in Cloud Storage;
// QueryVariables supplies parameters to the query.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_sparsql", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_20665"),
Location: pulumi.String("us-central1"),
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
},
},
SparkSqlBatch: &dataproc.BatchSparkSqlBatchArgs{
QueryFileUri: pulumi.String("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql"),
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
// Key/value pairs passed into the query for variable substitution.
QueryVariables: pulumi.StringMap{
"name": pulumi.String("value"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
// Spark SQL batch: executes a SQL script stored in Cloud Storage;
// QueryVariables supplies parameters to the query.
return await Deployment.RunAsync(() =>
{
var exampleBatchSparsql = new Gcp.Dataproc.Batch("example_batch_sparsql", new()
{
BatchId = "tf-test-batch_20665",
Location = "us-central1",
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
},
},
SparkSqlBatch = new Gcp.Dataproc.Inputs.BatchSparkSqlBatchArgs
{
QueryFileUri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
// Key/value pairs passed into the query for variable substitution.
QueryVariables =
{
{ "name", "value" },
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkSqlBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Spark SQL batch: executes a SQL script stored in Cloud Storage;
 * queryVariables supplies parameters to the query.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchSparsql = new Batch("exampleBatchSparsql", BatchArgs.builder()
.batchId("tf-test-batch_20665")
.location("us-central1")
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.build())
.build())
.sparkSqlBatch(BatchSparkSqlBatchArgs.builder()
.queryFileUri("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
// Key/value pairs passed into the query for variable substitution.
.queryVariables(Map.of("name", "value"))
.build())
.build());
}
}
# Spark SQL batch: executes a SQL script stored in Cloud Storage;
# queryVariables supplies parameters to the query.
resources:
exampleBatchSparsql:
type: gcp:dataproc:Batch
name: example_batch_sparsql
properties:
batchId: tf-test-batch_20665
location: us-central1
runtimeConfig:
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
sparkSqlBatch:
queryFileUri: gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
# Key/value pairs passed into the query for variable substitution.
queryVariables:
name: value
The sparkSqlBatch property points to a SQL file in GCS via queryFileUri. The queryVariables property passes parameters into the query, allowing you to parameterize table names or filter conditions. Spark SQL executes the query and returns results without requiring you to write Spark code.
Run Python Spark jobs with dependencies
Python-based Spark applications often require additional Python files, archives, and data files distributed to executors.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// PySpark batch: a main script plus extra Python modules, archives, and data
// files staged to executors; the "#unpacked" suffix extracts the archive.
const exampleBatchPyspark = new gcp.dataproc.Batch("example_batch_pyspark", {
batchId: "tf-test-batch_85160",
location: "us-central1",
runtimeConfig: {
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
},
},
pysparkBatch: {
mainPythonFileUri: "https://storage.googleapis.com/terraform-batches/test_util.py",
args: ["10"],
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
// Additional Python modules distributed to executors.
pythonFileUris: ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"],
archiveUris: [
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
],
// Plain data files staged alongside the job.
fileUris: ["https://storage.googleapis.com/terraform-batches/people.txt"],
},
});
import pulumi
import pulumi_gcp as gcp
# PySpark batch: a main script plus extra Python modules, archives, and data
# files staged to executors; the "#unpacked" suffix extracts the archive.
example_batch_pyspark = gcp.dataproc.Batch("example_batch_pyspark",
batch_id="tf-test-batch_85160",
location="us-central1",
runtime_config={
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
},
},
pyspark_batch={
"main_python_file_uri": "https://storage.googleapis.com/terraform-batches/test_util.py",
"args": ["10"],
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
# Additional Python modules distributed to executors.
"python_file_uris": ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"],
"archive_uris": [
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
],
# Plain data files staged alongside the job.
"file_uris": ["https://storage.googleapis.com/terraform-batches/people.txt"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// PySpark batch: a main script plus extra Python modules, archives, and data
// files staged to executors; the "#unpacked" suffix extracts the archive.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_pyspark", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_85160"),
Location: pulumi.String("us-central1"),
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
},
},
PysparkBatch: &dataproc.BatchPysparkBatchArgs{
MainPythonFileUri: pulumi.String("https://storage.googleapis.com/terraform-batches/test_util.py"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
// Additional Python modules distributed to executors.
PythonFileUris: pulumi.StringArray{
pulumi.String("gs://dataproc-examples/pyspark/hello-world/hello-world.py"),
},
ArchiveUris: pulumi.StringArray{
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked"),
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.jar"),
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt"),
},
// Plain data files staged alongside the job.
FileUris: pulumi.StringArray{
pulumi.String("https://storage.googleapis.com/terraform-batches/people.txt"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
// PySpark batch: a main script plus extra Python modules, archives, and data
// files staged to executors; the "#unpacked" suffix extracts the archive.
return await Deployment.RunAsync(() =>
{
var exampleBatchPyspark = new Gcp.Dataproc.Batch("example_batch_pyspark", new()
{
BatchId = "tf-test-batch_85160",
Location = "us-central1",
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
},
},
PysparkBatch = new Gcp.Dataproc.Inputs.BatchPysparkBatchArgs
{
MainPythonFileUri = "https://storage.googleapis.com/terraform-batches/test_util.py",
Args = new[]
{
"10",
},
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
// Additional Python modules distributed to executors.
PythonFileUris = new[]
{
"gs://dataproc-examples/pyspark/hello-world/hello-world.py",
},
ArchiveUris = new[]
{
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
},
// Plain data files staged alongside the job.
FileUris = new[]
{
"https://storage.googleapis.com/terraform-batches/people.txt",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchPysparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * PySpark batch: a main script plus extra Python modules, archives, and data
 * files staged to executors; the "#unpacked" suffix extracts the archive.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchPyspark = new Batch("exampleBatchPyspark", BatchArgs.builder()
.batchId("tf-test-batch_85160")
.location("us-central1")
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.build())
.build())
.pysparkBatch(BatchPysparkBatchArgs.builder()
.mainPythonFileUri("https://storage.googleapis.com/terraform-batches/test_util.py")
.args("10")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
// Additional Python modules distributed to executors.
.pythonFileUris("gs://dataproc-examples/pyspark/hello-world/hello-world.py")
.archiveUris(
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt")
// Plain data files staged alongside the job.
.fileUris("https://storage.googleapis.com/terraform-batches/people.txt")
.build())
.build());
}
}
# PySpark batch: a main script plus extra Python modules, archives, and data
# files staged to executors; the "#unpacked" suffix extracts the archive.
resources:
exampleBatchPyspark:
type: gcp:dataproc:Batch
name: example_batch_pyspark
properties:
batchId: tf-test-batch_85160
location: us-central1
runtimeConfig:
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
pysparkBatch:
mainPythonFileUri: https://storage.googleapis.com/terraform-batches/test_util.py
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
# Additional Python modules distributed to executors.
pythonFileUris:
- gs://dataproc-examples/pyspark/hello-world/hello-world.py
archiveUris:
- https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked
- https://storage.googleapis.com/terraform-batches/animals.txt.jar
- https://storage.googleapis.com/terraform-batches/animals.txt
# Plain data files staged alongside the job.
fileUris:
- https://storage.googleapis.com/terraform-batches/people.txt
The pysparkBatch property specifies the main Python file via mainPythonFileUri. The pythonFileUris property lists additional Python modules to distribute, archiveUris handles compressed dependencies (with optional extraction via the #unpacked suffix), and fileUris distributes data files. Dataproc automatically stages these files to all executors.
Enable automatic performance optimization
Spark jobs often require tuning for memory and scaling to achieve optimal performance. Autotuning analyzes job characteristics and adjusts configuration automatically.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
// tuning can use historical runs, and enables SCALING and MEMORY scenarios.
const exampleBatchAutotuning = new gcp.dataproc.Batch("example_batch_autotuning", {
batchId: "tf-test-batch_16199",
location: "us-central1",
labels: {
batch_test: "terraform",
},
runtimeConfig: {
version: "2.2",
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
// Groups related batches so optimizations carry across similar workloads.
cohort: "tf-dataproc-batch-example",
autotuningConfig: {
scenarios: [
"SCALING",
"MEMORY",
],
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
ttl: "3600s",
},
},
sparkBatch: {
mainClass: "org.apache.spark.examples.SparkPi",
args: ["10"],
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
},
});
import pulumi
import pulumi_gcp as gcp
# Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
# tuning can use historical runs, and enables SCALING and MEMORY scenarios.
example_batch_autotuning = gcp.dataproc.Batch("example_batch_autotuning",
batch_id="tf-test-batch_16199",
location="us-central1",
labels={
"batch_test": "terraform",
},
runtime_config={
"version": "2.2",
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
# Groups related batches so optimizations carry across similar workloads.
"cohort": "tf-dataproc-batch-example",
"autotuning_config": {
"scenarios": [
"SCALING",
"MEMORY",
],
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
"ttl": "3600s",
},
},
spark_batch={
"main_class": "org.apache.spark.examples.SparkPi",
"args": ["10"],
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
// tuning can use historical runs, and enables SCALING and MEMORY scenarios.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_autotuning", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_16199"),
Location: pulumi.String("us-central1"),
Labels: pulumi.StringMap{
"batch_test": pulumi.String("terraform"),
},
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Version: pulumi.String("2.2"),
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
// Groups related batches so optimizations carry across similar workloads.
Cohort: pulumi.String("tf-dataproc-batch-example"),
AutotuningConfig: &dataproc.BatchRuntimeConfigAutotuningConfigArgs{
Scenarios: pulumi.StringArray{
pulumi.String("SCALING"),
pulumi.String("MEMORY"),
},
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
Ttl: pulumi.String("3600s"),
},
},
SparkBatch: &dataproc.BatchSparkBatchArgs{
MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
// Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
// tuning can use historical runs, and enables SCALING and MEMORY scenarios.
return await Deployment.RunAsync(() =>
{
var exampleBatchAutotuning = new Gcp.Dataproc.Batch("example_batch_autotuning", new()
{
BatchId = "tf-test-batch_16199",
Location = "us-central1",
Labels =
{
{ "batch_test", "terraform" },
},
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Version = "2.2",
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
// Groups related batches so optimizations carry across similar workloads.
Cohort = "tf-dataproc-batch-example",
AutotuningConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigAutotuningConfigArgs
{
Scenarios = new[]
{
"SCALING",
"MEMORY",
},
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
Ttl = "3600s",
},
},
SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
{
MainClass = "org.apache.spark.examples.SparkPi",
Args = new[]
{
"10",
},
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigAutotuningConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
 * tuning can use historical runs, and enables SCALING and MEMORY scenarios.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchAutotuning = new Batch("exampleBatchAutotuning", BatchArgs.builder()
.batchId("tf-test-batch_16199")
.location("us-central1")
.labels(Map.of("batch_test", "terraform"))
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.version("2.2")
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
// Groups related batches so optimizations carry across similar workloads.
.cohort("tf-dataproc-batch-example")
.autotuningConfig(BatchRuntimeConfigAutotuningConfigArgs.builder()
.scenarios(
"SCALING",
"MEMORY")
.build())
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.ttl("3600s")
.build())
.build())
.sparkBatch(BatchSparkBatchArgs.builder()
.mainClass("org.apache.spark.examples.SparkPi")
.args("10")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
.build())
.build());
}
}
# Spark batch with autotuning: pins runtime version 2.2, joins a cohort so
# tuning can use historical runs, and enables SCALING and MEMORY scenarios.
resources:
exampleBatchAutotuning:
type: gcp:dataproc:Batch
name: example_batch_autotuning
properties:
batchId: tf-test-batch_16199
location: us-central1
labels:
batch_test: terraform
runtimeConfig:
version: '2.2'
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
# Groups related batches so optimizations carry across similar workloads.
cohort: tf-dataproc-batch-example
autotuningConfig:
scenarios:
- SCALING
- MEMORY
environmentConfig:
executionConfig:
subnetworkUri: default
ttl: 3600s
sparkBatch:
mainClass: org.apache.spark.examples.SparkPi
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
The autotuningConfig property enables automatic optimization for specified scenarios: SCALING adjusts parallelism and executor count, while MEMORY tunes memory allocation. The cohort property groups related batches together, allowing Dataproc to learn from historical runs and apply optimizations across similar workloads.
Beyond these examples
These snippets focus on specific batch-level features: Spark, PySpark, SparkSQL, and SparkR batch types, runtime configuration and autotuning, and dependency distribution. They’re intentionally minimal rather than full data processing pipelines.
The examples may reference pre-existing infrastructure such as VPC subnetworks (examples use ‘default’), Cloud Storage buckets for code and data, and JAR files and Python scripts at specified URIs. They focus on configuring the batch rather than provisioning the surrounding infrastructure.
To keep things focused, common batch patterns are omitted, including:
- KMS encryption keys (kmsKey)
- Custom service accounts (serviceAccount)
- Metastore integration (metastoreService)
- Spark History Server configuration
- Staging bucket configuration
- Network tags and authentication configuration
These omissions are intentional: the goal is to illustrate how each batch feature is wired, not provide drop-in data processing modules. See the Dataproc Batch resource reference for all available configuration options.
Let's deploy GCP Dataproc Serverless Batches
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Batch Configuration & Types
The Batch resource supports four workload types: sparkBatch (Java/Scala with JAR files), pysparkBatch (Python), sparkSqlBatch (SQL queries), and sparkRBatch (R scripts). Specify exactly one batch type per batch resource based on your workload language. The batchId must be 4-63 characters long and may contain only lowercase letters, numbers, and hyphens.
Immutability & Updates
The following fields are immutable: batchId, location, project, environmentConfig, all batch type configurations (sparkBatch, pysparkBatch, etc.), and runtimeConfig. Changing these requires replacing the resource. The labels field is non-authoritative and only manages labels defined in your configuration. To see all labels on the resource (including those added by GCP services), use effectiveLabels.
Networking & Security
To place a batch on a specific network, set environmentConfig.executionConfig.subnetworkUri to your subnet (e.g., "default"), or use networkUri for the full network path. You can also specify networkTags for firewall rules. For customer-managed encryption, set environmentConfig.executionConfig.kmsKey with your KMS key name, and ensure the Dataproc service account has the roles/cloudkms.cryptoKeyEncrypterDecrypter role on the key, using dependsOn to enforce ordering. To run the batch as a specific identity, set environmentConfig.executionConfig.serviceAccount to your service account email (e.g., compute service account or custom account).
Runtime & Performance
Use runtimeConfig.properties to set Spark configurations like spark.dynamicAllocation.enabled or spark.executor.instances; all examples demonstrate this pattern. To enable autotuning, set runtimeConfig.autotuningConfig.scenarios with optimization scenarios like SCALING or MEMORY, and set runtimeConfig.cohort to group batches for tuning. To cap runtime, set environmentConfig.executionConfig.ttl to a duration string like "3600s" (1 hour); the batch will terminate after this time.
Integration & Monitoring
To attach a Dataproc Metastore, set environmentConfig.peripheralsConfig.metastoreService to your Metastore service name (e.g., from gcp.dataproc.MetastoreService). To use a Spark History Server, set environmentConfig.peripheralsConfig.sparkHistoryServerConfig.dataprocCluster with your Dataproc cluster ID; the cluster must have Spark history logging enabled.
Using a different cloud?
Explore analytics guides for other cloud providers: