The aws:glue/job:Job resource, part of the Pulumi AWS provider, defines a Glue job: its script location, execution environment (Python ETL, Python shell, streaming, Ray, or Scala), and compute resources. This guide focuses on four capabilities: Python ETL jobs with worker scaling, Python shell jobs for lightweight processing, streaming jobs for continuous data, and CloudWatch logging and metrics.
Glue jobs require IAM execution roles, S3-hosted scripts, and optionally Glue connections for database access. The examples are intentionally small. Combine them with your own IAM policies, S3 buckets, and data sources.
Run Python ETL jobs with worker scaling
Most Glue deployments transform data with Python scripts, configuring worker types and counts to balance cost and performance.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs.
// The trust policy lets the AWS Glue service assume this role at job run time.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});

// Python ETL (Spark) job with worker scaling.
// NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
// are assumed to be declared elsewhere in the program.
const etlJob = new aws.glue.Job("etl_job", {
    name: "example-etl-job",
    description: "An example Glue ETL job",
    roleArn: glueJobRole.arn,
    glueVersion: "5.0",
    maxRetries: 0,
    timeout: 2880, // minutes (48 hours)
    numberOfWorkers: 2,
    workerType: "G.1X", // 4 vCPUs / 16 GB memory per worker
    connections: [example.name],
    executionClass: "STANDARD",
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/etl_job.py`,
        name: "glueetl", // Spark ETL job type
        pythonVersion: "3",
    },
    notificationProperty: {
        notifyDelayAfter: 3, // minutes before a delay notification fires
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        "--enable-metrics": "", // flag presence enables metrics; value is ignored
        "--enable-auto-scaling": "true",
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});

// Upload the ETL script that the job's scriptLocation points at.
const glueEtlScript = new aws.s3.BucketObjectv2("glue_etl_script", {
    bucket: glueScripts.id,
    key: "jobs/etl_job.py",
    source: new pulumi.asset.FileAsset("jobs/etl_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs.
# The trust policy lets the AWS Glue service assume this role at job run time.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))

# Python ETL (Spark) job with worker scaling.
# NOTE(review): `example` (a Glue connection) and `glue_scripts` (an S3 bucket)
# are assumed to be declared elsewhere in the program.
etl_job = aws.glue.Job("etl_job",
    name="example-etl-job",
    description="An example Glue ETL job",
    role_arn=glue_job_role.arn,
    glue_version="5.0",
    max_retries=0,
    timeout=2880,  # minutes (48 hours)
    number_of_workers=2,
    worker_type="G.1X",  # 4 vCPUs / 16 GB memory per worker
    connections=[example["name"]],
    execution_class="STANDARD",
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/etl_job.py",
        "name": "glueetl",  # Spark ETL job type
        "python_version": "3",
    },
    notification_property={
        "notify_delay_after": 3,  # minutes before a delay notification fires
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        "--enable-metrics": "",  # flag presence enables metrics; value is ignored
        "--enable-auto-scaling": "true",
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })

# Upload the ETL script that the job's script_location points at.
glue_etl_script = aws.s3.BucketObjectv2("glue_etl_script",
    bucket=glue_scripts["id"],
    key="jobs/etl_job.py",
    source=pulumi.FileAsset("jobs/etl_job.py"))
package main
import (
"encoding/json"
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tmpJSON0, err := json.Marshal(map[string]interface{}{
"Version": "2012-10-17",
"Statement": []map[string]interface{}{
map[string]interface{}{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": map[string]interface{}{
"Service": "glue.amazonaws.com",
},
},
},
})
if err != nil {
return err
}
json0 := string(tmpJSON0)
// IAM role for Glue jobs
glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
Name: pulumi.String("glue-job-role"),
AssumeRolePolicy: pulumi.String(json0),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "etl_job", &glue.JobArgs{
Name: pulumi.String("example-etl-job"),
Description: pulumi.String("An example Glue ETL job"),
RoleArn: glueJobRole.Arn,
GlueVersion: pulumi.String("5.0"),
MaxRetries: pulumi.Int(0),
Timeout: pulumi.Int(2880),
NumberOfWorkers: pulumi.Int(2),
WorkerType: pulumi.String("G.1X"),
Connections: pulumi.StringArray{
example.Name,
},
ExecutionClass: pulumi.String("STANDARD"),
Command: &glue.JobCommandArgs{
ScriptLocation: pulumi.Sprintf("s3://%v/jobs/etl_job.py", glueScripts.Bucket),
Name: pulumi.String("glueetl"),
PythonVersion: pulumi.String("3"),
},
NotificationProperty: &glue.JobNotificationPropertyArgs{
NotifyDelayAfter: pulumi.Int(3),
},
DefaultArguments: pulumi.StringMap{
"--job-language": pulumi.String("python"),
"--continuous-log-logGroup": pulumi.String("/aws-glue/jobs"),
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"--enable-continuous-log-filter": pulumi.String("true"),
"--enable-metrics": pulumi.String(""),
"--enable-auto-scaling": pulumi.String("true"),
},
ExecutionProperty: &glue.JobExecutionPropertyArgs{
MaxConcurrentRuns: pulumi.Int(1),
},
Tags: pulumi.StringMap{
"ManagedBy": pulumi.String("AWS"),
},
})
if err != nil {
return err
}
_, err = s3.NewBucketObjectv2(ctx, "glue_etl_script", &s3.BucketObjectv2Args{
Bucket: pulumi.Any(glueScripts.Id),
Key: pulumi.String("jobs/etl_job.py"),
Source: pulumi.NewFileAsset("jobs/etl_job.py"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // IAM role for Glue jobs.
    // The trust policy lets the AWS Glue service assume this role at job run time.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Python ETL (Spark) job with worker scaling.
    // NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
    // are assumed to be declared elsewhere in the program.
    var etlJob = new Aws.Glue.Job("etl_job", new()
    {
        Name = "example-etl-job",
        Description = "An example Glue ETL job",
        RoleArn = glueJobRole.Arn,
        GlueVersion = "5.0",
        MaxRetries = 0,
        Timeout = 2880, // minutes (48 hours)
        NumberOfWorkers = 2,
        WorkerType = "G.1X", // 4 vCPUs / 16 GB memory per worker
        Connections = new[]
        {
            example.Name,
        },
        ExecutionClass = "STANDARD",
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/etl_job.py",
            Name = "glueetl", // Spark ETL job type
            PythonVersion = "3",
        },
        NotificationProperty = new Aws.Glue.Inputs.JobNotificationPropertyArgs
        {
            NotifyDelayAfter = 3,
        },
        DefaultArguments =
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" },
            { "--enable-metrics", "" }, // flag presence enables metrics
            { "--enable-auto-scaling", "true" },
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags =
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the ETL script that the job's ScriptLocation points at.
    var glueEtlScript = new Aws.S3.BucketObjectv2("glue_etl_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/etl_job.py",
        Source = new FileAsset("jobs/etl_job.py"),
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobNotificationPropertyArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions an IAM role, a Glue Python ETL (Spark) job with worker scaling,
 * and uploads the job script to S3.
 *
 * NOTE(review): {@code example} (a Glue connection) and {@code glueScripts}
 * (an S3 bucket) are assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs; trust policy allows the Glue service to assume it.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        var etlJob = new Job("etlJob", JobArgs.builder()
            .name("example-etl-job")
            .description("An example Glue ETL job")
            .roleArn(glueJobRole.arn())
            .glueVersion("5.0")
            .maxRetries(0)
            .timeout(2880) // minutes (48 hours)
            .numberOfWorkers(2)
            .workerType("G.1X") // 4 vCPUs / 16 GB memory per worker
            .connections(example.name())
            .executionClass("STANDARD")
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/etl_job.py", glueScripts.bucket()))
                .name("glueetl") // Spark ETL job type
                .pythonVersion("3")
                .build())
            .notificationProperty(JobNotificationPropertyArgs.builder()
                .notifyDelayAfter(3)
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("--enable-continuous-log-filter", "true"),
                Map.entry("--enable-metrics", ""), // flag presence enables metrics
                Map.entry("--enable-auto-scaling", "true")
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the ETL script that the job's scriptLocation points at.
        var glueEtlScript = new BucketObjectv2("glueEtlScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/etl_job.py")
            .source(new FileAsset("jobs/etl_job.py"))
            .build());
    }
}
resources:
  # Python ETL (Spark) job with worker scaling.
  # NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
  # are assumed to be declared elsewhere in the program.
  etlJob:
    type: aws:glue:Job
    name: etl_job
    properties:
      name: example-etl-job
      description: An example Glue ETL job
      roleArn: ${glueJobRole.arn}
      glueVersion: '5.0'
      maxRetries: 0
      timeout: 2880 # minutes (48 hours)
      numberOfWorkers: 2
      workerType: G.1X # 4 vCPUs / 16 GB memory per worker
      connections:
        - ${example.name}
      executionClass: STANDARD
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/etl_job.py
        name: glueetl # Spark ETL job type
        pythonVersion: '3'
      notificationProperty:
        notifyDelayAfter: 3
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true'
        --enable-metrics: "" # flag presence enables metrics
        --enable-auto-scaling: 'true'
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the ETL script that the job's scriptLocation points at.
  glueEtlScript:
    type: aws:s3:BucketObjectv2
    name: glue_etl_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/etl_job.py
      source:
        fn::FileAsset: jobs/etl_job.py
The command block points to your Python script in S3 and sets the job type to glueetl. The workerType and numberOfWorkers properties control compute capacity; G.1X workers provide 4 vCPUs and 16 GB memory each. The defaultArguments property configures AWS Glue behavior, including CloudWatch logging and auto-scaling. Glue manages Spark clusters automatically based on your worker configuration.
Run lightweight Python scripts with shell jobs
When workloads don’t require Spark, Python shell jobs provide a lighter execution environment.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});

// Lightweight Python shell job (no Spark).
// NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
// are assumed to be declared elsewhere in the program.
const pythonShellJob = new aws.glue.Job("python_shell_job", {
    name: "example-python-shell-job",
    description: "An example Python shell job",
    roleArn: glueJobRole.arn,
    maxCapacity: 0.0625, // shell jobs accept 0.0625 or 1.0 DPUs
    maxRetries: 0,
    timeout: 2880, // minutes (48 hours)
    connections: [example.name],
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/shell_job.py`,
        name: "pythonshell", // Python-shell job type
        pythonVersion: "3.9",
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics", // preloaded library set for shell jobs
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});

// Upload the shell script that the job's scriptLocation points at.
const pythonShellScript = new aws.s3.BucketObjectv2("python_shell_script", {
    bucket: glueScripts.id,
    key: "jobs/shell_job.py",
    source: new pulumi.asset.FileAsset("jobs/shell_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))

# Lightweight Python shell job (no Spark).
# NOTE(review): `example` (a Glue connection) and `glue_scripts` (an S3 bucket)
# are assumed to be declared elsewhere in the program.
python_shell_job = aws.glue.Job("python_shell_job",
    name="example-python-shell-job",
    description="An example Python shell job",
    role_arn=glue_job_role.arn,
    max_capacity=0.0625,  # shell jobs accept 0.0625 or 1.0 DPUs
    max_retries=0,
    timeout=2880,  # minutes (48 hours)
    connections=[example["name"]],
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/shell_job.py",
        "name": "pythonshell",  # Python-shell job type
        "python_version": "3.9",
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics",  # preloaded library set for shell jobs
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })

# Upload the shell script that the job's script_location points at.
python_shell_script = aws.s3.BucketObjectv2("python_shell_script",
    bucket=glue_scripts["id"],
    key="jobs/shell_job.py",
    source=pulumi.FileAsset("jobs/shell_job.py"))
package main
import (
"encoding/json"
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tmpJSON0, err := json.Marshal(map[string]interface{}{
"Version": "2012-10-17",
"Statement": []map[string]interface{}{
map[string]interface{}{
"Action": "sts:AssumeRole",
"Effect": "Allow",
"Principal": map[string]interface{}{
"Service": "glue.amazonaws.com",
},
},
},
})
if err != nil {
return err
}
json0 := string(tmpJSON0)
// IAM role for Glue jobs
glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
Name: pulumi.String("glue-job-role"),
AssumeRolePolicy: pulumi.String(json0),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "python_shell_job", &glue.JobArgs{
Name: pulumi.String("example-python-shell-job"),
Description: pulumi.String("An example Python shell job"),
RoleArn: glueJobRole.Arn,
MaxCapacity: pulumi.Float64(0.0625),
MaxRetries: pulumi.Int(0),
Timeout: pulumi.Int(2880),
Connections: pulumi.StringArray{
example.Name,
},
Command: &glue.JobCommandArgs{
ScriptLocation: pulumi.Sprintf("s3://%v/jobs/shell_job.py", glueScripts.Bucket),
Name: pulumi.String("pythonshell"),
PythonVersion: pulumi.String("3.9"),
},
DefaultArguments: pulumi.StringMap{
"--job-language": pulumi.String("python"),
"--continuous-log-logGroup": pulumi.String("/aws-glue/jobs"),
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"library-set": pulumi.String("analytics"),
},
ExecutionProperty: &glue.JobExecutionPropertyArgs{
MaxConcurrentRuns: pulumi.Int(1),
},
Tags: pulumi.StringMap{
"ManagedBy": pulumi.String("AWS"),
},
})
if err != nil {
return err
}
_, err = s3.NewBucketObjectv2(ctx, "python_shell_script", &s3.BucketObjectv2Args{
Bucket: pulumi.Any(glueScripts.Id),
Key: pulumi.String("jobs/shell_job.py"),
Source: pulumi.NewFileAsset("jobs/shell_job.py"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // IAM role for Glue jobs.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Lightweight Python shell job (no Spark).
    // NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
    // are assumed to be declared elsewhere in the program.
    var pythonShellJob = new Aws.Glue.Job("python_shell_job", new()
    {
        Name = "example-python-shell-job",
        Description = "An example Python shell job",
        RoleArn = glueJobRole.Arn,
        MaxCapacity = 0.0625, // shell jobs accept 0.0625 or 1.0 DPUs
        MaxRetries = 0,
        Timeout = 2880, // minutes (48 hours)
        Connections = new[]
        {
            example.Name,
        },
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/shell_job.py",
            Name = "pythonshell", // Python-shell job type
            PythonVersion = "3.9",
        },
        DefaultArguments =
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "library-set", "analytics" }, // preloaded library set for shell jobs
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags =
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the shell script that the job's ScriptLocation points at.
    var pythonShellScript = new Aws.S3.BucketObjectv2("python_shell_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/shell_job.py",
        Source = new FileAsset("jobs/shell_job.py"),
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions an IAM role, a lightweight Glue Python shell job (no Spark),
 * and uploads the job script to S3.
 *
 * NOTE(review): {@code example} (a Glue connection) and {@code glueScripts}
 * (an S3 bucket) are assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs; trust policy allows the Glue service to assume it.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        var pythonShellJob = new Job("pythonShellJob", JobArgs.builder()
            .name("example-python-shell-job")
            .description("An example Python shell job")
            .roleArn(glueJobRole.arn())
            .maxCapacity(0.0625) // shell jobs accept 0.0625 or 1.0 DPUs
            .maxRetries(0)
            .timeout(2880) // minutes (48 hours)
            .connections(example.name())
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/shell_job.py", glueScripts.bucket()))
                .name("pythonshell") // Python-shell job type
                .pythonVersion("3.9")
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("library-set", "analytics") // preloaded library set
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the shell script that the job's scriptLocation points at.
        var pythonShellScript = new BucketObjectv2("pythonShellScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/shell_job.py")
            .source(new FileAsset("jobs/shell_job.py"))
            .build());
    }
}
resources:
  # Lightweight Python shell job (no Spark).
  # NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3 bucket)
  # are assumed to be declared elsewhere in the program.
  pythonShellJob:
    type: aws:glue:Job
    name: python_shell_job
    properties:
      name: example-python-shell-job
      description: An example Python shell job
      roleArn: ${glueJobRole.arn}
      maxCapacity: '0.0625' # shell jobs accept 0.0625 or 1.0 DPUs
      maxRetries: 0
      timeout: 2880 # minutes (48 hours)
      connections:
        - ${example.name}
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/shell_job.py
        name: pythonshell # Python-shell job type
        pythonVersion: '3.9'
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        library-set: analytics # preloaded library set for shell jobs
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the shell script that the job's scriptLocation points at.
  pythonShellScript:
    type: aws:s3:BucketObjectv2
    name: python_shell_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/shell_job.py
      source:
        fn::FileAsset: jobs/shell_job.py
The command name changes to pythonshell, which runs Python without Spark overhead. The maxCapacity property replaces workerType and numberOfWorkers; shell jobs accept either 0.0625 or 1.0 DPUs. The pythonVersion property specifies the Python runtime (3.9 in this case). Shell jobs are ideal for API calls, simple transformations, or orchestration logic.
Process continuous data with streaming jobs
Pipelines consuming from Kinesis or Kafka need streaming jobs that run continuously.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
// NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
// to be declared elsewhere in the program.
const example = new aws.glue.Job("example", {
    name: "example streaming job",
    roleArn: exampleAwsIamRole.arn,
    command: {
        name: "gluestreaming", // streaming job type
        scriptLocation: `s3://${exampleAwsS3Bucket.bucket}/example.script`,
    },
});
import pulumi
import pulumi_aws as aws

# Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
# NOTE(review): `example_aws_iam_role` and `example_aws_s3_bucket` are assumed
# to be declared elsewhere in the program.
example = aws.glue.Job("example",
    name="example streaming job",
    role_arn=example_aws_iam_role["arn"],
    command={
        "name": "gluestreaming",  # streaming job type
        "script_location": f"s3://{example_aws_s3_bucket['bucket']}/example.script",
    })
package main
import (
"fmt"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewJob(ctx, "example", &glue.JobArgs{
Name: pulumi.String("example streaming job"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Command: &glue.JobCommandArgs{
Name: pulumi.String("gluestreaming"),
ScriptLocation: pulumi.Sprintf("s3://%v/example.script", exampleAwsS3Bucket.Bucket),
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
    // NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
    // to be declared elsewhere in the program.
    var example = new Aws.Glue.Job("example", new()
    {
        Name = "example streaming job",
        RoleArn = exampleAwsIamRole.Arn,
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            Name = "gluestreaming", // streaming job type
            ScriptLocation = $"s3://{exampleAwsS3Bucket.Bucket}/example.script",
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions a continuously-running Glue streaming job.
 *
 * NOTE(review): {@code exampleAwsIamRole} and {@code exampleAwsS3Bucket} are
 * assumed to be declared elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var example = new Job("example", JobArgs.builder()
            .name("example streaming job")
            .roleArn(exampleAwsIamRole.arn())
            .command(JobCommandArgs.builder()
                .name("gluestreaming") // streaming job type
                .scriptLocation(String.format("s3://%s/example.script", exampleAwsS3Bucket.bucket()))
                .build())
            .build());
    }
}
resources:
  # Continuously-running streaming job (e.g. consuming Kinesis or Kafka).
  # NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are assumed
  # to be declared elsewhere in the program.
  example:
    type: aws:glue:Job
    properties:
      name: example streaming job
      roleArn: ${exampleAwsIamRole.arn}
      command:
        name: gluestreaming # streaming job type
        scriptLocation: s3://${exampleAwsS3Bucket.bucket}/example.script
The command name gluestreaming indicates a long-running job that processes data streams. Streaming jobs have no timeout by default (they run until stopped). Your script must implement continuous reading from sources like Kinesis Data Streams or Apache Kafka.
Enable CloudWatch logging and metrics
Production jobs require visibility into execution progress and errors.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Dedicated log group with a bounded retention for Glue job logs.
const example = new aws.cloudwatch.LogGroup("example", {
    name: "example",
    retentionInDays: 14,
});

// Glue job wired to stream continuous logs into the log group above
// and emit CloudWatch metrics.
const exampleJob = new aws.glue.Job("example", {defaultArguments: {
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true", // filters out verbose Spark messages
    "--enable-metrics": "", // flag presence enables metrics; value is ignored
}});
import pulumi
import pulumi_aws as aws

# Dedicated log group with a bounded retention for Glue job logs.
example = aws.cloudwatch.LogGroup("example",
    name="example",
    retention_in_days=14)

# Glue job wired to stream continuous logs into the log group above
# and emit CloudWatch metrics.
example_job = aws.glue.Job("example", default_arguments={
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",  # filters out verbose Spark messages
    "--enable-metrics": "",  # flag presence enables metrics; value is ignored
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/cloudwatch"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
example, err := cloudwatch.NewLogGroup(ctx, "example", &cloudwatch.LogGroupArgs{
Name: pulumi.String("example"),
RetentionInDays: pulumi.Int(14),
})
if err != nil {
return err
}
_, err = glue.NewJob(ctx, "example", &glue.JobArgs{
DefaultArguments: pulumi.StringMap{
"--continuous-log-logGroup": example.Name,
"--enable-continuous-cloudwatch-log": pulumi.String("true"),
"--enable-continuous-log-filter": pulumi.String("true"),
"--enable-metrics": pulumi.String(""),
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Dedicated log group with a bounded retention for Glue job logs.
    var example = new Aws.CloudWatch.LogGroup("example", new()
    {
        Name = "example",
        RetentionInDays = 14,
    });

    // Glue job wired to stream continuous logs into the log group above
    // and emit CloudWatch metrics.
    var exampleJob = new Aws.Glue.Job("example", new()
    {
        DefaultArguments =
        {
            { "--continuous-log-logGroup", example.Name },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" }, // drops verbose Spark messages
            { "--enable-metrics", "" }, // flag presence enables metrics
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.cloudwatch.LogGroup;
import com.pulumi.aws.cloudwatch.LogGroupArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Provisions a CloudWatch log group and a Glue job configured to stream
 * continuous logs into it and emit CloudWatch metrics.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Dedicated log group with a bounded retention for Glue job logs.
        var example = new LogGroup("example", LogGroupArgs.builder()
            .name("example")
            .retentionInDays(14)
            .build());

        var exampleJob = new Job("exampleJob", JobArgs.builder()
            .defaultArguments(Map.ofEntries(
                Map.entry("--continuous-log-logGroup", example.name()),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("--enable-continuous-log-filter", "true"), // drops verbose Spark messages
                Map.entry("--enable-metrics", "") // flag presence enables metrics
            ))
            .build());
    }
}
resources:
  # Dedicated log group with a bounded retention for Glue job logs.
  example:
    type: aws:cloudwatch:LogGroup
    properties:
      name: example
      retentionInDays: 14
  # Glue job wired to stream continuous logs into the log group above
  # and emit CloudWatch metrics.
  exampleJob:
    type: aws:glue:Job
    name: example
    properties:
      defaultArguments:
        --continuous-log-logGroup: ${example.name}
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true' # drops verbose Spark messages
        --enable-metrics: "" # flag presence enables metrics
The defaultArguments property controls CloudWatch integration. Setting enable-continuous-cloudwatch-log to “true” streams logs to the specified log group. The enable-metrics flag (empty string value) activates CloudWatch metrics for job monitoring. The enable-continuous-log-filter flag reduces log volume by filtering out verbose Spark messages.
Beyond these examples
These snippets focus on specific job-level features: Python ETL and shell job execution, streaming job configuration, and CloudWatch logging and metrics. They’re intentionally minimal rather than full data pipelines.
The examples may reference pre-existing infrastructure such as IAM execution roles with Glue service trust, S3 buckets for script storage, and Glue connections for database access. They focus on configuring the job rather than provisioning everything around it.
To keep things focused, common job patterns are omitted, including:
- Job scheduling and triggers (separate resources)
- Security configurations for encryption
- Retry and timeout tuning (maxRetries, timeout)
- Execution class selection (FLEX vs STANDARD)
- Job run queuing (jobRunQueuingEnabled)
- Ray and Scala job configurations
These omissions are intentional: the goal is to illustrate how each job feature is wired, not provide drop-in ETL pipelines. See the Glue Job resource reference for all available configuration options.
Let's create AWS Glue ETL Jobs
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Job Configuration & Types
- To run a Ray job, set glueVersion to 4.0 or greater, workerType to Z.2X, and configure command.name as glueray with command.runtime (e.g., Ray2.4).
- The job type is determined by command.name. Use glueetl for ETL jobs, pythonshell for Python scripts, gluestreaming for streaming jobs, and glueray for Ray-based jobs.
- Supported worker types are Standard, G.1X, G.2X, G.025X, G.4X, G.8X, G.12X, G.16X, R.1X, R.2X, R.4X, R.8X, and Z.2X. Use Z.2X specifically for Ray jobs.
Resource Allocation
- For Spark-based jobs, set numberOfWorkers and workerType instead of maxCapacity. Only use maxCapacity for pythonshell jobs, where it accepts either 0.0625 or 1.0.
- Python shell jobs require maxCapacity set to either 0.0625 or 1.0.
Monitoring & Logging
- Enable CloudWatch integration through defaultArguments with --enable-continuous-cloudwatch-log: "true", --continuous-log-logGroup (log group name), --enable-continuous-log-filter: "true", and --enable-metrics: "".
- Job behavior is configured through the defaultArguments property using AWS Glue special parameters.
Timeouts & Execution
- The default timeout is 2880 minutes (48 hours) for glueetl and pythonshell jobs, and 0 (unlimited) for gluestreaming jobs. Leave timeout unconfigured for glueray jobs.
- The STANDARD execution class is ideal for time-sensitive workloads that require fast job startup and dedicated resources. FLEX provides more flexible execution.
- The name property is immutable and cannot be changed after the job is created.
Using a different cloud?
Explore analytics guides for other cloud providers: