The aws:glue/job:Job resource, part of the Pulumi AWS provider, defines a Glue job: its script location, execution environment (Python ETL, Python shell, streaming, Ray, or Scala), and compute resources. This guide focuses on four capabilities: Python ETL jobs with worker scaling, Python shell jobs for lightweight processing, streaming jobs for continuous data, and CloudWatch logging and metrics.
Glue jobs require IAM execution roles, S3-hosted scripts, and optionally Glue connections for database access. The examples are intentionally small. Combine them with your own IAM policies, S3 buckets, and data sources.
Run Python ETL jobs with worker scaling
Most Glue deployments transform data with Python scripts, configuring worker types and counts to balance cost and performance.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs.
// The trust policy lets the AWS Glue service assume this role at job run time.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});

// Python ETL (Spark) job with worker scaling.
// NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
// are assumed to be declared elsewhere in the program.
const etlJob = new aws.glue.Job("etl_job", {
    name: "example-etl-job",
    description: "An example Glue ETL job",
    roleArn: glueJobRole.arn,
    glueVersion: "5.0",
    maxRetries: 0,
    timeout: 2880, // minutes (48 hours)
    numberOfWorkers: 2,
    workerType: "G.1X", // 4 vCPUs / 16 GB memory per worker
    connections: [example.name],
    executionClass: "STANDARD",
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/etl_job.py`,
        name: "glueetl", // Spark ETL job type
        pythonVersion: "3",
    },
    notificationProperty: {
        notifyDelayAfter: 3, // minutes before a delay notification fires
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        "--enable-metrics": "", // flag presence enables metrics; value is ignored
        "--enable-auto-scaling": "true",
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});

// Upload the ETL script that the job's scriptLocation points at.
const glueEtlScript = new aws.s3.BucketObjectv2("glue_etl_script", {
    bucket: glueScripts.id,
    key: "jobs/etl_job.py",
    source: new pulumi.asset.FileAsset("jobs/etl_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs.
# The trust policy lets the AWS Glue service assume this role at job run time.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))

# Python ETL (Spark) job with worker scaling.
# NOTE(review): `example` (a Glue connection) and `glue_scripts` (an S3 bucket)
# are assumed to be declared elsewhere in the program.
etl_job = aws.glue.Job("etl_job",
    name="example-etl-job",
    description="An example Glue ETL job",
    role_arn=glue_job_role.arn,
    glue_version="5.0",
    max_retries=0,
    timeout=2880,  # minutes (48 hours)
    number_of_workers=2,
    worker_type="G.1X",  # 4 vCPUs / 16 GB memory per worker
    connections=[example["name"]],
    execution_class="STANDARD",
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/etl_job.py",
        "name": "glueetl",  # Spark ETL job type
        "python_version": "3",
    },
    notification_property={
        "notify_delay_after": 3,  # minutes before a delay notification fires
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        "--enable-metrics": "",  # flag presence enables metrics; value is ignored
        "--enable-auto-scaling": "true",
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })

# Upload the ETL script that the job's script_location points at.
glue_etl_script = aws.s3.BucketObjectv2("glue_etl_script",
    bucket=glue_scripts["id"],
    key="jobs/etl_job.py",
    source=pulumi.FileAsset("jobs/etl_job.py"))
package main
import (
"encoding/json"
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tmpJSON0, err := json.Marshal(map[string]interface{}{
"Version": "2012-10-17",
"Statement": []map[string]interface{}{
map[string]interface{}{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": map[string]interface{}{
"Service": "glue.amazonaws.com",
},
},
},
})
if err != nil {
return err
}
json0 := string(tmpJSON0)
// IAM role for Glue jobs
glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
Name: pulumi.String("glue-job-role"),
AssumeRolePolicy: pulumi.String(json0),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "etl_job", &glue.JobArgs{
Name: pulumi.String("example-etl-job"),
Description: pulumi.String("An example Glue ETL job"),
RoleArn: glueJobRole.Arn,
GlueVersion: pulumi.String("5.0"),
MaxRetries: pulumi.Int(0),
Timeout: pulumi.Int(2880),
NumberOfWorkers: pulumi.Int(2),
WorkerType: pulumi.String("G.1X"),
Connections: pulumi.StringArray{
example.Name,
},
ExecutionClass: pulumi.String("STANDARD"),
Command: &glue.JobCommandArgs{
ScriptLocation: pulumi.Sprintf("s3://%v/jobs/etl_job.py", glueScripts.Bucket),
Name: pulumi.String("glueetl"),
PythonVersion: pulumi.String("3"),
},
NotificationProperty: &glue.JobNotificationPropertyArgs{
NotifyDelayAfter: pulumi.Int(3),
},
DefaultArguments: pulumi.StringMap{
"--job-language": pulumi.String("python"),
"--continuous-log-logGroup": pulumi.String("/aws-glue/jobs"),
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"--enable-continuous-log-filter": pulumi.String("true"),
"--enable-metrics": pulumi.String(""),
"--enable-auto-scaling": pulumi.String("true"),
},
ExecutionProperty: &glue.JobExecutionPropertyArgs{
MaxConcurrentRuns: pulumi.Int(1),
},
Tags: pulumi.StringMap{
"ManagedBy": pulumi.String("AWS"),
},
})
if err != nil {
return err
}
_, err = s3.NewBucketObjectv2(ctx, "glue_etl_script", &s3.BucketObjectv2Args{
Bucket: pulumi.Any(glueScripts.Id),
Key: pulumi.String("jobs/etl_job.py"),
Source: pulumi.NewFileAsset("jobs/etl_job.py"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // IAM role for Glue jobs.
    // The trust policy lets the AWS Glue service assume this role at job run time.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Python ETL (Spark) job with worker scaling.
    // NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
    // are assumed to be declared elsewhere in the program.
    var etlJob = new Aws.Glue.Job("etl_job", new()
    {
        Name = "example-etl-job",
        Description = "An example Glue ETL job",
        RoleArn = glueJobRole.Arn,
        GlueVersion = "5.0",
        MaxRetries = 0,
        Timeout = 2880, // minutes (48 hours)
        NumberOfWorkers = 2,
        WorkerType = "G.1X", // 4 vCPUs / 16 GB memory per worker
        Connections = new[]
        {
            example.Name,
        },
        ExecutionClass = "STANDARD",
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/etl_job.py",
            Name = "glueetl", // Spark ETL job type
            PythonVersion = "3",
        },
        NotificationProperty = new Aws.Glue.Inputs.JobNotificationPropertyArgs
        {
            NotifyDelayAfter = 3,
        },
        DefaultArguments =
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" },
            { "--enable-metrics", "" }, // flag presence enables metrics
            { "--enable-auto-scaling", "true" },
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags =
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the ETL script that the job's ScriptLocation points at.
    var glueEtlScript = new Aws.S3.BucketObjectv2("glue_etl_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/etl_job.py",
        Source = new FileAsset("jobs/etl_job.py"),
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobNotificationPropertyArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions an IAM role, a Glue Python ETL (Spark) job with worker scaling,
 * and uploads the job script to S3.
 *
 * NOTE(review): {@code example} (a Glue connection) and {@code glueScripts}
 * (an S3 bucket) are assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs; trust policy allows the Glue service to assume it.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        var etlJob = new Job("etlJob", JobArgs.builder()
            .name("example-etl-job")
            .description("An example Glue ETL job")
            .roleArn(glueJobRole.arn())
            .glueVersion("5.0")
            .maxRetries(0)
            .timeout(2880) // minutes (48 hours)
            .numberOfWorkers(2)
            .workerType("G.1X") // 4 vCPUs / 16 GB memory per worker
            .connections(example.name())
            .executionClass("STANDARD")
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/etl_job.py", glueScripts.bucket()))
                .name("glueetl") // Spark ETL job type
                .pythonVersion("3")
                .build())
            .notificationProperty(JobNotificationPropertyArgs.builder()
                .notifyDelayAfter(3)
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("--enable-continuous-log-filter", "true"),
                Map.entry("--enable-metrics", ""), // flag presence enables metrics
                Map.entry("--enable-auto-scaling", "true")
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the ETL script that the job's scriptLocation points at.
        var glueEtlScript = new BucketObjectv2("glueEtlScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/etl_job.py")
            .source(new FileAsset("jobs/etl_job.py"))
            .build());
    }
}
resources:
  # Python ETL (Spark) job with worker scaling.
  # NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
  # are assumed to be declared elsewhere in the program.
  etlJob:
    type: aws:glue:Job
    name: etl_job
    properties:
      name: example-etl-job
      description: An example Glue ETL job
      roleArn: ${glueJobRole.arn}
      glueVersion: '5.0'
      maxRetries: 0
      timeout: 2880 # minutes (48 hours)
      numberOfWorkers: 2
      workerType: G.1X # 4 vCPUs / 16 GB memory per worker
      connections:
        - ${example.name}
      executionClass: STANDARD
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/etl_job.py
        name: glueetl # Spark ETL job type
        pythonVersion: '3'
      notificationProperty:
        notifyDelayAfter: 3
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true'
        --enable-metrics: "" # flag presence enables metrics
        --enable-auto-scaling: 'true'
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the ETL script that the job's scriptLocation points at.
  glueEtlScript:
    type: aws:s3:BucketObjectv2
    name: glue_etl_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/etl_job.py
      source:
        fn::FileAsset: jobs/etl_job.py
The command block points to your Python script in S3 and sets the job type to glueetl. The workerType and numberOfWorkers properties control compute capacity; G.1X workers provide 4 vCPUs and 16 GB memory each. The defaultArguments property configures AWS Glue behavior, including CloudWatch logging and auto-scaling. Glue manages Spark clusters automatically based on your worker configuration.
Run lightweight Python scripts with shell jobs
When workloads don’t require Spark, Python shell jobs provide a lighter execution environment.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});

// Lightweight Python shell job (no Spark).
// NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
// are assumed to be declared elsewhere in the program.
const pythonShellJob = new aws.glue.Job("python_shell_job", {
    name: "example-python-shell-job",
    description: "An example Python shell job",
    roleArn: glueJobRole.arn,
    maxCapacity: 0.0625, // shell jobs accept 0.0625 or 1.0 DPUs
    maxRetries: 0,
    timeout: 2880, // minutes (48 hours)
    connections: [example.name],
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/shell_job.py`,
        name: "pythonshell", // Python-shell job type
        pythonVersion: "3.9",
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics", // preloaded library set for shell jobs
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});

// Upload the shell script that the job's scriptLocation points at.
const pythonShellScript = new aws.s3.BucketObjectv2("python_shell_script", {
    bucket: glueScripts.id,
    key: "jobs/shell_job.py",
    source: new pulumi.asset.FileAsset("jobs/shell_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))

# Lightweight Python shell job (no Spark).
# NOTE(review): `example` (a Glue connection) and `glue_scripts` (an S3 bucket)
# are assumed to be declared elsewhere in the program.
python_shell_job = aws.glue.Job("python_shell_job",
    name="example-python-shell-job",
    description="An example Python shell job",
    role_arn=glue_job_role.arn,
    max_capacity=0.0625,  # shell jobs accept 0.0625 or 1.0 DPUs
    max_retries=0,
    timeout=2880,  # minutes (48 hours)
    connections=[example["name"]],
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/shell_job.py",
        "name": "pythonshell",  # Python-shell job type
        "python_version": "3.9",
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics",  # preloaded library set for shell jobs
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })

# Upload the shell script that the job's script_location points at.
python_shell_script = aws.s3.BucketObjectv2("python_shell_script",
    bucket=glue_scripts["id"],
    key="jobs/shell_job.py",
    source=pulumi.FileAsset("jobs/shell_job.py"))
package main
import (
"encoding/json"
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tmpJSON0, err := json.Marshal(map[string]interface{}{
"Version": "2012-10-17",
"Statement": []map[string]interface{}{
map[string]interface{}{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": map[string]interface{}{
"Service": "glue.amazonaws.com",
},
},
},
})
if err != nil {
return err
}
json0 := string(tmpJSON0)
// IAM role for Glue jobs
glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
Name: pulumi.String("glue-job-role"),
AssumeRolePolicy: pulumi.String(json0),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "python_shell_job", &glue.JobArgs{
Name: pulumi.String("example-python-shell-job"),
Description: pulumi.String("An example Python shell job"),
RoleArn: glueJobRole.Arn,
MaxCapacity: pulumi.Float64(0.0625),
MaxRetries: pulumi.Int(0),
Timeout: pulumi.Int(2880),
Connections: pulumi.StringArray{
example.Name,
},
Command: &glue.JobCommandArgs{
ScriptLocation: pulumi.Sprintf("s3://%v/jobs/shell_job.py", glueScripts.Bucket),
Name: pulumi.String("pythonshell"),
PythonVersion: pulumi.String("3.9"),
},
DefaultArguments: pulumi.StringMap{
"--job-language": pulumi.String("python"),
"--continuous-log-logGroup": pulumi.String("/aws-glue/jobs"),
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"library-set": pulumi.String("analytics"),
},
ExecutionProperty: &glue.JobExecutionPropertyArgs{
MaxConcurrentRuns: pulumi.Int(1),
},
Tags: pulumi.StringMap{
"ManagedBy": pulumi.String("AWS"),
},
})
if err != nil {
return err
}
_, err = s3.NewBucketObjectv2(ctx, "python_shell_script", &s3.BucketObjectv2Args{
Bucket: pulumi.Any(glueScripts.Id),
Key: pulumi.String("jobs/shell_job.py"),
Source: pulumi.NewFileAsset("jobs/shell_job.py"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // IAM role for Glue jobs.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Lightweight Python shell job (no Spark).
    // NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
    // are assumed to be declared elsewhere in the program.
    var pythonShellJob = new Aws.Glue.Job("python_shell_job", new()
    {
        Name = "example-python-shell-job",
        Description = "An example Python shell job",
        RoleArn = glueJobRole.Arn,
        MaxCapacity = 0.0625, // shell jobs accept 0.0625 or 1.0 DPUs
        MaxRetries = 0,
        Timeout = 2880, // minutes (48 hours)
        Connections = new[]
        {
            example.Name,
        },
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/shell_job.py",
            Name = "pythonshell", // Python-shell job type
            PythonVersion = "3.9",
        },
        DefaultArguments =
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "library-set", "analytics" }, // preloaded library set for shell jobs
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags =
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the shell script that the job's ScriptLocation points at.
    var pythonShellScript = new Aws.S3.BucketObjectv2("python_shell_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/shell_job.py",
        Source = new FileAsset("jobs/shell_job.py"),
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions an IAM role, a lightweight Glue Python shell job (no Spark),
 * and uploads the job script to S3.
 *
 * NOTE(review): {@code example} (a Glue connection) and {@code glueScripts}
 * (an S3 bucket) are assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs; trust policy allows the Glue service to assume it.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        var pythonShellJob = new Job("pythonShellJob", JobArgs.builder()
            .name("example-python-shell-job")
            .description("An example Python shell job")
            .roleArn(glueJobRole.arn())
            .maxCapacity(0.0625) // shell jobs accept 0.0625 or 1.0 DPUs
            .maxRetries(0)
            .timeout(2880) // minutes (48 hours)
            .connections(example.name())
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/shell_job.py", glueScripts.bucket()))
                .name("pythonshell") // Python-shell job type
                .pythonVersion("3.9")
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("library-set", "analytics") // preloaded library set
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the shell script that the job's scriptLocation points at.
        var pythonShellScript = new BucketObjectv2("pythonShellScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/shell_job.py")
            .source(new FileAsset("jobs/shell_job.py"))
            .build());
    }
}
resources:
  # Lightweight Python shell job (no Spark).
  # NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
  # are assumed to be declared elsewhere in the program.
  pythonShellJob:
    type: aws:glue:Job
    name: python_shell_job
    properties:
      name: example-python-shell-job
      description: An example Python shell job
      roleArn: ${glueJobRole.arn}
      maxCapacity: '0.0625' # shell jobs accept 0.0625 or 1.0 DPUs
      maxRetries: 0
      timeout: 2880 # minutes (48 hours)
      connections:
        - ${example.name}
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/shell_job.py
        name: pythonshell # Python-shell job type
        pythonVersion: '3.9'
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        library-set: analytics # preloaded library set for shell jobs
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the shell script that the job's scriptLocation points at.
  pythonShellScript:
    type: aws:s3:BucketObjectv2
    name: python_shell_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/shell_job.py
      source:
        fn::FileAsset: jobs/shell_job.py
The command name changes to pythonshell, which runs Python without Spark overhead. The maxCapacity property replaces workerType and numberOfWorkers; shell jobs accept either 0.0625 or 1.0 DPUs. The pythonVersion property specifies the Python runtime (3.9 in this case). Shell jobs are ideal for API calls, simple transformations, or orchestration logic.
Process continuous data with streaming jobs
Pipelines consuming from Kinesis or Kafka need streaming jobs that run continuously.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
// NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
// to be declared elsewhere in the program.
const example = new aws.glue.Job("example", {
    name: "example streaming job",
    roleArn: exampleAwsIamRole.arn,
    command: {
        name: "gluestreaming", // streaming job type
        scriptLocation: `s3://${exampleAwsS3Bucket.bucket}/example.script`,
    },
});
import pulumi
import pulumi_aws as aws

# Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
# NOTE(review): `example_aws_iam_role` and `example_aws_s3_bucket` are assumed
# to be declared elsewhere in the program.
example = aws.glue.Job("example",
    name="example streaming job",
    role_arn=example_aws_iam_role["arn"],
    command={
        "name": "gluestreaming",  # streaming job type
        "script_location": f"s3://{example_aws_s3_bucket['bucket']}/example.script",
    })
package main
import (
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewJob(ctx, "example", &glue.JobArgs{
Name: pulumi.String("example streaming job"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Command: &glue.JobCommandArgs{
Name: pulumi.String("gluestreaming"),
ScriptLocation: pulumi.Sprintf("s3://%v/example.script", exampleAwsS3Bucket.Bucket),
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
    // NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
    // to be declared elsewhere in the program.
    var example = new Aws.Glue.Job("example", new()
    {
        Name = "example streaming job",
        RoleArn = exampleAwsIamRole.Arn,
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            Name = "gluestreaming", // streaming job type
            ScriptLocation = $"s3://{exampleAwsS3Bucket.Bucket}/example.script",
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions a continuously-running Glue streaming job.
 *
 * NOTE(review): {@code exampleAwsIamRole} and {@code exampleAwsS3Bucket} are
 * assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var example = new Job("example", JobArgs.builder()
            .name("example streaming job")
            .roleArn(exampleAwsIamRole.arn())
            .command(JobCommandArgs.builder()
                .name("gluestreaming") // streaming job type
                .scriptLocation(String.format("s3://%s/example.script", exampleAwsS3Bucket.bucket()))
                .build())
            .build());
    }
}
resources:
  # Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
  # NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
  # to be declared elsewhere in the program.
  example:
    type: aws:glue:Job
    properties:
      name: example streaming job
      roleArn: ${exampleAwsIamRole.arn}
      command:
        name: gluestreaming # streaming job type
        scriptLocation: s3://${exampleAwsS3Bucket.bucket}/example.script
The command name gluestreaming indicates a long-running job that processes data streams. Streaming jobs have no timeout by default (they run until stopped). Your script must implement continuous reading from sources like Kinesis Data Streams or Apache Kafka.
Enable CloudWatch logging and metrics
Production jobs require visibility into execution progress and errors.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Dedicated log group with a bounded retention for Glue job logs.
const example = new aws.cloudwatch.LogGroup("example", {
    name: "example",
    retentionInDays: 14,
});

// Glue job wired to stream continuous logs into the log group above
// and emit CloudWatch metrics.
const exampleJob = new aws.glue.Job("example", {defaultArguments: {
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true", // filters out verbose Spark messages
    "--enable-metrics": "", // flag presence enables metrics; value is ignored
}});
import pulumi
import pulumi_aws as aws

# Dedicated log group with a bounded retention for Glue job logs.
example = aws.cloudwatch.LogGroup("example",
    name="example",
    retention_in_days=14)

# Glue job wired to stream continuous logs into the log group above
# and emit CloudWatch metrics.
example_job = aws.glue.Job("example", default_arguments={
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",  # filters out verbose Spark messages
    "--enable-metrics": "",  # flag presence enables metrics; value is ignored
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/cloudwatch"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
example, err := cloudwatch.NewLogGroup(ctx, "example", &cloudwatch.LogGroupArgs{
Name: pulumi.String("example"),
RetentionInDays: pulumi.Int(14),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "example", &glue.JobArgs{
DefaultArguments: pulumi.StringMap{
"--continuous-log-logGroup": example.Name,
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"--enable-continuous-log-filter": pulumi.String("true"),
"--enable-metrics": pulumi.String(""),
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Dedicated log group with a bounded retention for Glue job logs.
    var example = new Aws.CloudWatch.LogGroup("example", new()
    {
        Name = "example",
        RetentionInDays = 14,
    });

    // Glue job wired to stream continuous logs into the log group above
    // and emit CloudWatch metrics.
    var exampleJob = new Aws.Glue.Job("example", new()
    {
        DefaultArguments =
        {
            { "--continuous-log-logGroup", example.Name },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" }, // drops verbose Spark messages
            { "--enable-metrics", "" }, // flag presence enables metrics
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.cloudwatch.LogGroup;
import com.pulumi.aws.cloudwatch.LogGroupArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions a CloudWatch log group and a Glue job configured to stream
 * continuous logs into it and emit CloudWatch metrics.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Dedicated log group with a bounded retention for Glue job logs.
        var example = new LogGroup("example", LogGroupArgs.builder()
            .name("example")
            .retentionInDays(14)
            .build());

        var exampleJob = new Job("exampleJob", JobArgs.builder()
            .defaultArguments(Map.ofEntries(
                Map.entry("--continuous-log-logGroup", example.name()),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("--enable-continuous-log-filter", "true"), // drops verbose Spark messages
                Map.entry("--enable-metrics", "") // flag presence enables metrics
            ))
            .build());
    }
}
resources:
  # Dedicated log group with a bounded retention for Glue job logs.
  example:
    type: aws:cloudwatch:LogGroup
    properties:
      name: example
      retentionInDays: 14
  # Glue job wired to stream continuous logs into the log group above
  # and emit CloudWatch metrics.
  exampleJob:
    type: aws:glue:Job
    name: example
    properties:
      defaultArguments:
        --continuous-log-logGroup: ${example.name}
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true' # drops verbose Spark messages
        --enable-metrics: "" # flag presence enables metrics
The defaultArguments property controls CloudWatch integration. Setting enable-continuous-cloudwatch-log to “true” streams logs to the specified log group. The enable-metrics flag (empty string value) activates CloudWatch metrics for job monitoring. The enable-continuous-log-filter flag reduces log volume by filtering out verbose Spark messages.
Beyond these examples
These snippets focus on specific job-level features: Python ETL and shell job execution, streaming job configuration, and CloudWatch logging and metrics. They’re intentionally minimal rather than full data pipelines.
The examples may reference pre-existing infrastructure such as IAM execution roles with Glue service trust, S3 buckets for script storage, and Glue connections for database access. They focus on configuring the job rather than provisioning everything around it.
To keep things focused, common job patterns are omitted, including:
- Job scheduling and triggers (separate resources)
- Security configurations for encryption
- Retry and timeout tuning (maxRetries, timeout)
- Execution class selection (FLEX vs STANDARD)
- Job run queuing (jobRunQueuingEnabled)
- Ray and Scala job configurations
These omissions are intentional: the goal is to illustrate how each job feature is wired, not provide drop-in ETL pipelines. See the Glue Job resource reference for all available configuration options.
Let's create AWS Glue ETL Jobs
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Job Configuration & Types
- To run a Ray job, set glueVersion to 4.0 or greater, workerType to Z.2X, and configure command.name as glueray with command.runtime (e.g., Ray2.4).
- The job type is determined by command.name. Use glueetl for ETL jobs, pythonshell for Python scripts, gluestreaming for streaming jobs, and glueray for Ray-based jobs.
- Supported worker types are Standard, G.1X, G.2X, G.025X, G.4X, G.8X, G.12X, G.16X, R.1X, R.2X, R.4X, R.8X, and Z.2X. Use Z.2X specifically for Ray jobs.
Resource Allocation
- For Spark-based jobs, set numberOfWorkers and workerType instead of maxCapacity. Only use maxCapacity for pythonshell jobs, where it accepts either 0.0625 or 1.0.
- Python shell jobs require maxCapacity set to either 0.0625 or 1.0.
Monitoring & Logging
- Enable CloudWatch integration through defaultArguments with --enable-continuous-cloudwatch-log: "true", --continuous-log-logGroup (log group name), --enable-continuous-log-filter: "true", and --enable-metrics: "".
- Job behavior is configured through the defaultArguments property using AWS Glue special parameters.
Timeouts & Execution
- The default timeout is 2880 minutes (48 hours) for glueetl and pythonshell jobs, and 0 (unlimited) for gluestreaming jobs. Leave timeout unconfigured for glueray jobs.
- The STANDARD execution class is ideal for time-sensitive workloads that require fast job startup and dedicated resources. FLEX provides more flexible execution.
- The name property is immutable and cannot be changed after the job is created.
Using a different cloud?
Explore analytics guides for other cloud providers: