The gcp:dataproc/batch:Batch resource, part of the Pulumi GCP provider, defines Dataproc Serverless batch jobs that run Spark workloads without provisioning clusters. This guide focuses on three capabilities: Spark and PySpark job execution, SQL query processing from Cloud Storage, and automatic performance tuning.
Batches reference VPC subnetworks, Cloud Storage paths for code and data, and optionally integrate with Metastore services and Spark History Servers. The examples are intentionally small. Combine them with your own networking, storage, and security configuration.
Run a Spark job with runtime properties
Most Dataproc Serverless workloads start with a basic Spark batch that specifies the main class, runtime properties, and execution environment.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const exampleBatchSpark = new gcp.dataproc.Batch("example_batch_spark", {
batchId: "tf-test-batch_40289",
location: "us-central1",
labels: {
batch_test: "terraform",
},
runtimeConfig: {
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
ttl: "3600s",
networkTags: ["tag1"],
},
},
sparkBatch: {
mainClass: "org.apache.spark.examples.SparkPi",
args: ["10"],
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
},
});
import pulumi
import pulumi_gcp as gcp
example_batch_spark = gcp.dataproc.Batch("example_batch_spark",
batch_id="tf-test-batch_40289",
location="us-central1",
labels={
"batch_test": "terraform",
},
runtime_config={
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
"ttl": "3600s",
"network_tags": ["tag1"],
},
},
spark_batch={
"main_class": "org.apache.spark.examples.SparkPi",
"args": ["10"],
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_spark", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_40289"),
Location: pulumi.String("us-central1"),
Labels: pulumi.StringMap{
"batch_test": pulumi.String("terraform"),
},
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
Ttl: pulumi.String("3600s"),
NetworkTags: pulumi.StringArray{
pulumi.String("tag1"),
},
},
},
SparkBatch: &dataproc.BatchSparkBatchArgs{
MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var exampleBatchSpark = new Gcp.Dataproc.Batch("example_batch_spark", new()
{
BatchId = "tf-test-batch_40289",
Location = "us-central1",
Labels =
{
{ "batch_test", "terraform" },
},
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
Ttl = "3600s",
NetworkTags = new[]
{
"tag1",
},
},
},
SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
{
MainClass = "org.apache.spark.examples.SparkPi",
Args = new[]
{
"10",
},
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchSpark = new Batch("exampleBatchSpark", BatchArgs.builder()
.batchId("tf-test-batch_40289")
.location("us-central1")
.labels(Map.of("batch_test", "terraform"))
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.ttl("3600s")
.networkTags("tag1")
.build())
.build())
.sparkBatch(BatchSparkBatchArgs.builder()
.mainClass("org.apache.spark.examples.SparkPi")
.args("10")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
.build())
.build());
}
}
resources:
exampleBatchSpark:
type: gcp:dataproc:Batch
name: example_batch_spark
properties:
batchId: tf-test-batch_40289
location: us-central1
labels:
batch_test: terraform
runtimeConfig:
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
ttl: 3600s
networkTags:
- tag1
sparkBatch:
mainClass: org.apache.spark.examples.SparkPi
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
The sparkBatch property defines the Spark application: mainClass specifies the entry point, jarFileUris lists the JARs to load, and args passes command-line arguments. The runtimeConfig sets Spark properties like executor count, while environmentConfig places the job in a specific subnetwork and sets time-to-live limits.
Execute SQL queries from Cloud Storage
Analytics teams often store SQL queries in Cloud Storage and execute them against data lakes without managing infrastructure.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const exampleBatchSparsql = new gcp.dataproc.Batch("example_batch_sparsql", {
batchId: "tf-test-batch_33395",
location: "us-central1",
runtimeConfig: {
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
},
},
sparkSqlBatch: {
queryFileUri: "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
queryVariables: {
name: "value",
},
},
});
import pulumi
import pulumi_gcp as gcp
example_batch_sparsql = gcp.dataproc.Batch("example_batch_sparsql",
batch_id="tf-test-batch_33395",
location="us-central1",
runtime_config={
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
},
},
spark_sql_batch={
"query_file_uri": "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
"query_variables": {
"name": "value",
},
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_sparsql", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_33395"),
Location: pulumi.String("us-central1"),
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
},
},
SparkSqlBatch: &dataproc.BatchSparkSqlBatchArgs{
QueryFileUri: pulumi.String("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql"),
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
QueryVariables: pulumi.StringMap{
"name": pulumi.String("value"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var exampleBatchSparsql = new Gcp.Dataproc.Batch("example_batch_sparsql", new()
{
BatchId = "tf-test-batch_33395",
Location = "us-central1",
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
},
},
SparkSqlBatch = new Gcp.Dataproc.Inputs.BatchSparkSqlBatchArgs
{
QueryFileUri = "gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql",
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
QueryVariables =
{
{ "name", "value" },
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkSqlBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchSparsql = new Batch("exampleBatchSparsql", BatchArgs.builder()
.batchId("tf-test-batch_33395")
.location("us-central1")
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.build())
.build())
.sparkSqlBatch(BatchSparkSqlBatchArgs.builder()
.queryFileUri("gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
.queryVariables(Map.of("name", "value"))
.build())
.build());
}
}
resources:
exampleBatchSparsql:
type: gcp:dataproc:Batch
name: example_batch_sparsql
properties:
batchId: tf-test-batch_33395
location: us-central1
runtimeConfig:
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
sparkSqlBatch:
queryFileUri: gs://dataproc-examples/spark-sql/natality/cigarette_correlations.sql
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
queryVariables:
name: value
The sparkSqlBatch property points to a SQL file in GCS via queryFileUri. The queryVariables property passes key-value pairs that substitute into the SQL query at runtime, enabling parameterized queries without hardcoding values.
Run Python Spark jobs with dependencies
Python-based data pipelines need to package dependencies and reference external files during execution.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const exampleBatchPyspark = new gcp.dataproc.Batch("example_batch_pyspark", {
batchId: "tf-test-batch_76044",
location: "us-central1",
runtimeConfig: {
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
},
},
pysparkBatch: {
mainPythonFileUri: "https://storage.googleapis.com/terraform-batches/test_util.py",
args: ["10"],
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
pythonFileUris: ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"],
archiveUris: [
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
],
fileUris: ["https://storage.googleapis.com/terraform-batches/people.txt"],
},
});
import pulumi
import pulumi_gcp as gcp
example_batch_pyspark = gcp.dataproc.Batch("example_batch_pyspark",
batch_id="tf-test-batch_76044",
location="us-central1",
runtime_config={
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
},
},
pyspark_batch={
"main_python_file_uri": "https://storage.googleapis.com/terraform-batches/test_util.py",
"args": ["10"],
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
"python_file_uris": ["gs://dataproc-examples/pyspark/hello-world/hello-world.py"],
"archive_uris": [
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
],
"file_uris": ["https://storage.googleapis.com/terraform-batches/people.txt"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_pyspark", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_76044"),
Location: pulumi.String("us-central1"),
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
},
},
PysparkBatch: &dataproc.BatchPysparkBatchArgs{
MainPythonFileUri: pulumi.String("https://storage.googleapis.com/terraform-batches/test_util.py"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
PythonFileUris: pulumi.StringArray{
pulumi.String("gs://dataproc-examples/pyspark/hello-world/hello-world.py"),
},
ArchiveUris: pulumi.StringArray{
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked"),
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt.jar"),
pulumi.String("https://storage.googleapis.com/terraform-batches/animals.txt"),
},
FileUris: pulumi.StringArray{
pulumi.String("https://storage.googleapis.com/terraform-batches/people.txt"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var exampleBatchPyspark = new Gcp.Dataproc.Batch("example_batch_pyspark", new()
{
BatchId = "tf-test-batch_76044",
Location = "us-central1",
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
},
},
PysparkBatch = new Gcp.Dataproc.Inputs.BatchPysparkBatchArgs
{
MainPythonFileUri = "https://storage.googleapis.com/terraform-batches/test_util.py",
Args = new[]
{
"10",
},
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
PythonFileUris = new[]
{
"gs://dataproc-examples/pyspark/hello-world/hello-world.py",
},
ArchiveUris = new[]
{
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt",
},
FileUris = new[]
{
"https://storage.googleapis.com/terraform-batches/people.txt",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchPysparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchPyspark = new Batch("exampleBatchPyspark", BatchArgs.builder()
.batchId("tf-test-batch_76044")
.location("us-central1")
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.build())
.build())
.pysparkBatch(BatchPysparkBatchArgs.builder()
.mainPythonFileUri("https://storage.googleapis.com/terraform-batches/test_util.py")
.args("10")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
.pythonFileUris("gs://dataproc-examples/pyspark/hello-world/hello-world.py")
.archiveUris(
"https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked",
"https://storage.googleapis.com/terraform-batches/animals.txt.jar",
"https://storage.googleapis.com/terraform-batches/animals.txt")
.fileUris("https://storage.googleapis.com/terraform-batches/people.txt")
.build())
.build());
}
}
resources:
exampleBatchPyspark:
type: gcp:dataproc:Batch
name: example_batch_pyspark
properties:
batchId: tf-test-batch_76044
location: us-central1
runtimeConfig:
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
environmentConfig:
executionConfig:
subnetworkUri: default
pysparkBatch:
mainPythonFileUri: https://storage.googleapis.com/terraform-batches/test_util.py
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
pythonFileUris:
- gs://dataproc-examples/pyspark/hello-world/hello-world.py
archiveUris:
- https://storage.googleapis.com/terraform-batches/animals.txt.tar.gz#unpacked
- https://storage.googleapis.com/terraform-batches/animals.txt.jar
- https://storage.googleapis.com/terraform-batches/animals.txt
fileUris:
- https://storage.googleapis.com/terraform-batches/people.txt
The pysparkBatch property specifies the main Python file and additional dependencies. The pythonFileUris property lists Python modules to include, archiveUris extracts compressed files into the working directory, and fileUris makes data files available to the job. Archives can use fragment identifiers (e.g., #unpacked) to control extraction behavior.
Enable automatic performance optimization
Production workloads benefit from automatic tuning that adjusts resource allocation and memory settings based on job characteristics.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const exampleBatchAutotuning = new gcp.dataproc.Batch("example_batch_autotuning", {
batchId: "tf-test-batch_8270",
location: "us-central1",
labels: {
batch_test: "terraform",
},
runtimeConfig: {
version: "2.2",
properties: {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
cohort: "tf-dataproc-batch-example",
autotuningConfig: {
scenarios: [
"SCALING",
"MEMORY",
],
},
},
environmentConfig: {
executionConfig: {
subnetworkUri: "default",
ttl: "3600s",
},
},
sparkBatch: {
mainClass: "org.apache.spark.examples.SparkPi",
args: ["10"],
jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
},
});
import pulumi
import pulumi_gcp as gcp
example_batch_autotuning = gcp.dataproc.Batch("example_batch_autotuning",
batch_id="tf-test-batch_8270",
location="us-central1",
labels={
"batch_test": "terraform",
},
runtime_config={
"version": "2.2",
"properties": {
"spark.dynamicAllocation.enabled": "false",
"spark.executor.instances": "2",
},
"cohort": "tf-dataproc-batch-example",
"autotuning_config": {
"scenarios": [
"SCALING",
"MEMORY",
],
},
},
environment_config={
"execution_config": {
"subnetwork_uri": "default",
"ttl": "3600s",
},
},
spark_batch={
"main_class": "org.apache.spark.examples.SparkPi",
"args": ["10"],
"jar_file_uris": ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
})
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataproc.NewBatch(ctx, "example_batch_autotuning", &dataproc.BatchArgs{
BatchId: pulumi.String("tf-test-batch_8270"),
Location: pulumi.String("us-central1"),
Labels: pulumi.StringMap{
"batch_test": pulumi.String("terraform"),
},
RuntimeConfig: &dataproc.BatchRuntimeConfigArgs{
Version: pulumi.String("2.2"),
Properties: pulumi.StringMap{
"spark.dynamicAllocation.enabled": pulumi.String("false"),
"spark.executor.instances": pulumi.String("2"),
},
Cohort: pulumi.String("tf-dataproc-batch-example"),
AutotuningConfig: &dataproc.BatchRuntimeConfigAutotuningConfigArgs{
Scenarios: pulumi.StringArray{
pulumi.String("SCALING"),
pulumi.String("MEMORY"),
},
},
},
EnvironmentConfig: &dataproc.BatchEnvironmentConfigArgs{
ExecutionConfig: &dataproc.BatchEnvironmentConfigExecutionConfigArgs{
SubnetworkUri: pulumi.String("default"),
Ttl: pulumi.String("3600s"),
},
},
SparkBatch: &dataproc.BatchSparkBatchArgs{
MainClass: pulumi.String("org.apache.spark.examples.SparkPi"),
Args: pulumi.StringArray{
pulumi.String("10"),
},
JarFileUris: pulumi.StringArray{
pulumi.String("file:///usr/lib/spark/examples/jars/spark-examples.jar"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var exampleBatchAutotuning = new Gcp.Dataproc.Batch("example_batch_autotuning", new()
{
BatchId = "tf-test-batch_8270",
Location = "us-central1",
Labels =
{
{ "batch_test", "terraform" },
},
RuntimeConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigArgs
{
Version = "2.2",
Properties =
{
{ "spark.dynamicAllocation.enabled", "false" },
{ "spark.executor.instances", "2" },
},
Cohort = "tf-dataproc-batch-example",
AutotuningConfig = new Gcp.Dataproc.Inputs.BatchRuntimeConfigAutotuningConfigArgs
{
Scenarios = new[]
{
"SCALING",
"MEMORY",
},
},
},
EnvironmentConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigArgs
{
ExecutionConfig = new Gcp.Dataproc.Inputs.BatchEnvironmentConfigExecutionConfigArgs
{
SubnetworkUri = "default",
Ttl = "3600s",
},
},
SparkBatch = new Gcp.Dataproc.Inputs.BatchSparkBatchArgs
{
MainClass = "org.apache.spark.examples.SparkPi",
Args = new[]
{
"10",
},
JarFileUris = new[]
{
"file:///usr/lib/spark/examples/jars/spark-examples.jar",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.Batch;
import com.pulumi.gcp.dataproc.BatchArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchRuntimeConfigAutotuningConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchEnvironmentConfigExecutionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.BatchSparkBatchArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var exampleBatchAutotuning = new Batch("exampleBatchAutotuning", BatchArgs.builder()
.batchId("tf-test-batch_8270")
.location("us-central1")
.labels(Map.of("batch_test", "terraform"))
.runtimeConfig(BatchRuntimeConfigArgs.builder()
.version("2.2")
.properties(Map.ofEntries(
Map.entry("spark.dynamicAllocation.enabled", "false"),
Map.entry("spark.executor.instances", "2")
))
.cohort("tf-dataproc-batch-example")
.autotuningConfig(BatchRuntimeConfigAutotuningConfigArgs.builder()
.scenarios(
"SCALING",
"MEMORY")
.build())
.build())
.environmentConfig(BatchEnvironmentConfigArgs.builder()
.executionConfig(BatchEnvironmentConfigExecutionConfigArgs.builder()
.subnetworkUri("default")
.ttl("3600s")
.build())
.build())
.sparkBatch(BatchSparkBatchArgs.builder()
.mainClass("org.apache.spark.examples.SparkPi")
.args("10")
.jarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar")
.build())
.build());
}
}
resources:
exampleBatchAutotuning:
type: gcp:dataproc:Batch
name: example_batch_autotuning
properties:
batchId: tf-test-batch_8270
location: us-central1
labels:
batch_test: terraform
runtimeConfig:
version: '2.2'
properties:
spark.dynamicAllocation.enabled: 'false'
spark.executor.instances: '2'
cohort: tf-dataproc-batch-example
autotuningConfig:
scenarios:
- SCALING
- MEMORY
environmentConfig:
executionConfig:
subnetworkUri: default
ttl: 3600s
sparkBatch:
mainClass: org.apache.spark.examples.SparkPi
args:
- '10'
jarFileUris:
- file:///usr/lib/spark/examples/jars/spark-examples.jar
The autotuningConfig property enables automatic optimization for specified scenarios. Setting scenarios to ["SCALING", "MEMORY"] allows Dataproc to adjust executor count and memory allocation. The cohort property groups related batches for shared tuning insights across similar workloads.
Beyond these examples
These snippets focus on specific batch-level features: Spark, PySpark, and SparkSQL batch execution, runtime property configuration, and automatic performance tuning. They're intentionally minimal rather than full data pipeline deployments.
The examples reference pre-existing infrastructure such as VPC subnetworks (the examples use "default"), Cloud Storage buckets for code and data, and JAR files and Python scripts at the specified URIs. They focus on configuring the batch job rather than provisioning everything around it.
To keep things focused, common batch patterns are omitted, including:
- KMS encryption keys (kmsKey)
- Custom service accounts (serviceAccount)
- Staging buckets for intermediate data
- Metastore service integration
- Spark History Server configuration
- Network tags and authentication configuration
These omissions are intentional: the goal is to illustrate how each batch feature is wired, not to provide drop-in data pipeline modules. See the Dataproc Batch resource reference for all available configuration options.
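As a hedged sketch of where several of those omitted settings attach, the snippet below extends environmentConfig.executionConfig with a custom service account, staging bucket, KMS key, and network tags. The account, bucket, and key names are placeholders rather than resources defined in this guide, and the exact field set follows the executionConfig block used in the earlier examples.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// A minimal sketch, assuming the service account, staging bucket, and KMS key already exist.
const hardenedBatch = new gcp.dataproc.Batch("hardened_batch", {
    batchId: "hardened-batch-example",
    location: "us-central1",
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default",
            // Placeholder identifiers; substitute resources from your own project.
            serviceAccount: "dataproc-batches@my-project.iam.gserviceaccount.com",
            stagingBucket: "my-dataproc-staging-bucket",
            kmsKey: "projects/my-project/locations/us-central1/keyRings/my-ring/cryptoKeys/my-key",
            networkTags: ["dataproc-batch"],
            ttl: "3600s",
        },
    },
    sparkBatch: {
        mainClass: "org.apache.spark.examples.SparkPi",
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    },
});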
Frequently Asked Questions
Configuration & Immutability
The immutable fields are batchId, location, project, environmentConfig, runtimeConfig, and all batch type configurations (sparkBatch, pysparkBatch, etc.); changing any of them requires recreating the batch. The batchId must be 4-63 characters long and contain only lowercase letters, numbers, and hyphens. Spark properties such as spark.executor.instances or spark.dynamicAllocation.enabled are set in runtimeConfig.properties.
Batch Types & Workloads
You can create four types:
- Spark (sparkBatch) - Java/Scala with mainClass
- PySpark (pysparkBatch) - Python with mainPythonFileUri
- SparkSQL (sparkSqlBatch) - SQL queries with queryFileUri
- SparkR (sparkRBatch) - R scripts with mainRFileUri (see the sketch after this list)
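The guide above does not include a SparkR example; as a minimal sketch, a SparkR batch swaps in the sparkRBatch block, with the Cloud Storage path to the R script below being a placeholder.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// A minimal SparkR sketch; the R script location is a placeholder.
const exampleBatchSparkr = new gcp.dataproc.Batch("example_batch_sparkr", {
    batchId: "example-batch-sparkr",
    location: "us-central1",
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default",
        },
    },
    sparkRBatch: {
        mainRFileUri: "gs://my-bucket/scripts/analysis.R",
        args: ["10"],
    },
});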
Use sparkSqlBatch.queryVariables to define key-value pairs that are substituted into your query at runtime. Append #unpacked to archive URIs in archiveUris (e.g., animals.txt.tar.gz#unpacked) to control how archives are extracted.
Advanced Features
- Autotuning: enable runtimeConfig.autotuningConfig with scenarios like SCALING and MEMORY, and optionally set a cohort to group related batches.
- Customer-managed encryption: set environmentConfig.executionConfig.kmsKey to your KMS key name. The Dataproc service account needs the roles/cloudkms.cryptoKeyEncrypterDecrypter role, which you should configure with dependsOn to ensure proper ordering.
- Metastore integration: set environmentConfig.peripheralsConfig.metastoreService to your Dataproc Metastore service name.
- Spark History Server: set environmentConfig.peripheralsConfig.sparkHistoryServerConfig.dataprocCluster to your Dataproc cluster ID.
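As a hedged sketch of the last two items, peripheralsConfig sits next to executionConfig inside environmentConfig; the Metastore service and history-server cluster names below are placeholders for resources you would provision separately.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
// A minimal sketch, assuming the Metastore service and history-server cluster already exist.
const batchWithPeripherals = new gcp.dataproc.Batch("batch_with_peripherals", {
    batchId: "batch-with-peripherals",
    location: "us-central1",
    environmentConfig: {
        executionConfig: {
            subnetworkUri: "default",
        },
        peripheralsConfig: {
            // Placeholder resource names.
            metastoreService: "projects/my-project/locations/us-central1/services/my-metastore",
            sparkHistoryServerConfig: {
                dataprocCluster: "projects/my-project/regions/us-central1/clusters/my-history-cluster",
            },
        },
    },
    sparkBatch: {
        mainClass: "org.apache.spark.examples.SparkPi",
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    },
});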
Labels & Metadata
The labels field is non-authoritative and only manages labels defined in your configuration. To see all labels on the resource (including those added by other clients or services), use the effectiveLabels output property.
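For instance, a program that declares the Spark batch from the first example could export both views of its labels; this small sketch re-declares a trimmed version of that resource so the exports are self-contained.
import * as gcp from "@pulumi/gcp";
const exampleBatchSpark = new gcp.dataproc.Batch("example_batch_spark", {
    batchId: "tf-test-batch_40289",
    location: "us-central1",
    labels: { batch_test: "terraform" },
    sparkBatch: {
        mainClass: "org.apache.spark.examples.SparkPi",
        jarFileUris: ["file:///usr/lib/spark/examples/jars/spark-examples.jar"],
    },
});
// labels: only the labels managed by this program.
export const managedLabels = exampleBatchSpark.labels;
// effectiveLabels: all labels on the resource, including ones added outside Pulumi.
export const allLabels = exampleBatchSpark.effectiveLabels;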