Create AWS Glue ETL Jobs

The aws:glue/job:Job resource, part of the Pulumi AWS provider, defines a Glue job: its script location, execution environment (Spark, Python shell, Ray, or streaming), worker configuration, and runtime arguments. This guide focuses on five capabilities: ETL jobs with Spark and auto-scaling, Python shell jobs for lightweight scripts, Ray jobs for distributed Python, streaming jobs for real-time processing, and CloudWatch logging and metrics.

Glue jobs require IAM execution roles, S3 buckets for script storage, and optionally Glue connections for database access or CloudWatch log groups for monitoring. The examples are intentionally small. Combine them with your own IAM policies, data sources, and monitoring infrastructure.

Run ETL jobs with Spark and auto-scaling

Most Glue deployments start with ETL jobs that transform data using Spark, with managed workers and CloudWatch logging for observability.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs
// Trust policy lets the AWS Glue service assume this execution role.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});
// Spark ETL job ("glueetl" command) with auto-scaling and continuous
// CloudWatch logging enabled via defaultArguments.
// NOTE(review): `example` (a Glue connection) and `glueScripts` (an S3
// bucket) are defined outside this snippet — confirm they exist.
const etlJob = new aws.glue.Job("etl_job", {
    name: "example-etl-job",
    description: "An example Glue ETL job",
    roleArn: glueJobRole.arn,
    glueVersion: "5.0",
    maxRetries: 0,
    timeout: 2880,
    numberOfWorkers: 2,
    workerType: "G.1X",
    connections: [example.name],
    executionClass: "STANDARD",
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/etl_job.py`,
        name: "glueetl",
        pythonVersion: "3",
    },
    notificationProperty: {
        notifyDelayAfter: 3,
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        // "--enable-metrics" is toggled by its presence; its value stays empty.
        "--enable-metrics": "",
        "--enable-auto-scaling": "true",
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});
// Upload the script the job's scriptLocation points at.
const glueEtlScript = new aws.s3.BucketObjectv2("glue_etl_script", {
    bucket: glueScripts.id,
    key: "jobs/etl_job.py",
    source: new pulumi.asset.FileAsset("jobs/etl_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs
# Trust policy lets the AWS Glue service assume this execution role.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))
# Spark ETL job ("glueetl" command) with auto-scaling and continuous
# CloudWatch logging enabled via default_arguments.
# NOTE(review): `example` (a Glue connection) and `glue_scripts` (an S3
# bucket) are defined outside this snippet — confirm they exist.
etl_job = aws.glue.Job("etl_job",
    name="example-etl-job",
    description="An example Glue ETL job",
    role_arn=glue_job_role.arn,
    glue_version="5.0",
    max_retries=0,
    timeout=2880,
    number_of_workers=2,
    worker_type="G.1X",
    connections=[example["name"]],
    execution_class="STANDARD",
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/etl_job.py",
        "name": "glueetl",
        "python_version": "3",
    },
    notification_property={
        "notify_delay_after": 3,
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "--enable-continuous-log-filter": "true",
        # "--enable-metrics" is toggled by its presence; its value stays empty.
        "--enable-metrics": "",
        "--enable-auto-scaling": "true",
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })
# Upload the script the job's script_location points at.
glue_etl_script = aws.s3.BucketObjectv2("glue_etl_script",
    bucket=glue_scripts["id"],
    key="jobs/etl_job.py",
    source=pulumi.FileAsset("jobs/etl_job.py"))
package main

import (
	"encoding/json"

	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Trust policy letting the AWS Glue service assume the execution role.
		tmpJSON0, err := json.Marshal(map[string]interface{}{
			"Version": "2012-10-17",
			"Statement": []map[string]interface{}{
				map[string]interface{}{
					"Action": "sts:AssumeRole",
					"Effect": "Allow",
					"Principal": map[string]interface{}{
						"Service": "glue.amazonaws.com",
					},
				},
			},
		})
		if err != nil {
			return err
		}
		json0 := string(tmpJSON0)
		// IAM role for Glue jobs
		glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
			Name:             pulumi.String("glue-job-role"),
			AssumeRolePolicy: pulumi.String(json0),
		})
		if err != nil {
			return err
		}
		// Spark ETL job ("glueetl") with auto-scaling and continuous CloudWatch
		// logging. NOTE(review): `example` (Glue connection) and `glueScripts`
		// (S3 bucket) are defined outside this snippet — confirm they exist.
		_, err = glue.NewJob(ctx, "etl_job", &glue.JobArgs{
			Name:            pulumi.String("example-etl-job"),
			Description:     pulumi.String("An example Glue ETL job"),
			RoleArn:         glueJobRole.Arn,
			GlueVersion:     pulumi.String("5.0"),
			MaxRetries:      pulumi.Int(0),
			Timeout:         pulumi.Int(2880),
			NumberOfWorkers: pulumi.Int(2),
			WorkerType:      pulumi.String("G.1X"),
			Connections: pulumi.StringArray{
				example.Name,
			},
			ExecutionClass: pulumi.String("STANDARD"),
			Command: &glue.JobCommandArgs{
				ScriptLocation: pulumi.Sprintf("s3://%v/jobs/etl_job.py", glueScripts.Bucket),
				Name:           pulumi.String("glueetl"),
				PythonVersion:  pulumi.String("3"),
			},
			NotificationProperty: &glue.JobNotificationPropertyArgs{
				NotifyDelayAfter: pulumi.Int(3),
			},
			DefaultArguments: pulumi.StringMap{
				"--job-language":                     pulumi.String("python"),
				"--continuous-log-logGroup":          pulumi.String("/aws-glue/jobs"),
				"--enable-continuous-cloudwatch-log": pulumi.String("true"),
				"--enable-continuous-log-filter":     pulumi.String("true"),
				"--enable-metrics":                   pulumi.String(""),
				"--enable-auto-scaling":              pulumi.String("true"),
			},
			ExecutionProperty: &glue.JobExecutionPropertyArgs{
				MaxConcurrentRuns: pulumi.Int(1),
			},
			Tags: pulumi.StringMap{
				"ManagedBy": pulumi.String("AWS"),
			},
		})
		if err != nil {
			return err
		}
		// Upload the script the job's ScriptLocation points at.
		_, err = s3.NewBucketObjectv2(ctx, "glue_etl_script", &s3.BucketObjectv2Args{
			Bucket: pulumi.Any(glueScripts.Id),
			Key:    pulumi.String("jobs/etl_job.py"),
			Source: pulumi.NewFileAsset("jobs/etl_job.py"),
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    // IAM role for Glue jobs
    // Trust policy lets the AWS Glue service assume this execution role.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Spark ETL job ("glueetl") with auto-scaling and continuous CloudWatch
    // logging. NOTE(review): `example` (Glue connection) and `glueScripts`
    // (S3 bucket) are defined outside this snippet — confirm they exist.
    var etlJob = new Aws.Glue.Job("etl_job", new()
    {
        Name = "example-etl-job",
        Description = "An example Glue ETL job",
        RoleArn = glueJobRole.Arn,
        GlueVersion = "5.0",
        MaxRetries = 0,
        Timeout = 2880,
        NumberOfWorkers = 2,
        WorkerType = "G.1X",
        Connections = new[]
        {
            example.Name,
        },
        ExecutionClass = "STANDARD",
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/etl_job.py",
            Name = "glueetl",
            PythonVersion = "3",
        },
        NotificationProperty = new Aws.Glue.Inputs.JobNotificationPropertyArgs
        {
            NotifyDelayAfter = 3,
        },
        DefaultArguments = 
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" },
            { "--enable-metrics", "" },
            { "--enable-auto-scaling", "true" },
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags = 
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the script the job's ScriptLocation points at.
    var glueEtlScript = new Aws.S3.BucketObjectv2("glue_etl_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/etl_job.py",
        Source = new FileAsset("jobs/etl_job.py"),
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobNotificationPropertyArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs
        // Trust policy lets the AWS Glue service assume this execution role.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        // Spark ETL job ("glueetl") with auto-scaling and continuous CloudWatch
        // logging. NOTE(review): `example` (Glue connection) and `glueScripts`
        // (S3 bucket) are defined outside this snippet — confirm they exist.
        var etlJob = new Job("etlJob", JobArgs.builder()
            .name("example-etl-job")
            .description("An example Glue ETL job")
            .roleArn(glueJobRole.arn())
            .glueVersion("5.0")
            .maxRetries(0)
            .timeout(2880)
            .numberOfWorkers(2)
            .workerType("G.1X")
            .connections(example.name())
            .executionClass("STANDARD")
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/etl_job.py", glueScripts.bucket()))
                .name("glueetl")
                .pythonVersion("3")
                .build())
            .notificationProperty(JobNotificationPropertyArgs.builder()
                .notifyDelayAfter(3)
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("--enable-continuous-log-filter", "true"),
                Map.entry("--enable-metrics", ""),
                Map.entry("--enable-auto-scaling", "true")
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the script the job's scriptLocation points at.
        var glueEtlScript = new BucketObjectv2("glueEtlScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/etl_job.py")
            .source(new FileAsset("jobs/etl_job.py"))
            .build());

    }
}
resources:
  # Spark ETL job ("glueetl") with auto-scaling and continuous CloudWatch
  # logging. NOTE(review): ${example} (Glue connection) and ${glueScripts}
  # (S3 bucket) are defined outside this snippet — confirm they exist.
  etlJob:
    type: aws:glue:Job
    name: etl_job
    properties:
      name: example-etl-job
      description: An example Glue ETL job
      roleArn: ${glueJobRole.arn}
      glueVersion: '5.0'
      maxRetries: 0
      timeout: 2880
      numberOfWorkers: 2
      workerType: G.1X
      connections:
        - ${example.name}
      executionClass: STANDARD
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/etl_job.py
        name: glueetl
        pythonVersion: '3'
      notificationProperty:
        notifyDelayAfter: 3
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true'
        # "--enable-metrics" is toggled by its presence; its value stays empty.
        --enable-metrics: ""
        --enable-auto-scaling: 'true'
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  # Trust policy lets the AWS Glue service assume this execution role.
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the script the job's scriptLocation points at.
  glueEtlScript:
    type: aws:s3:BucketObjectv2
    name: glue_etl_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/etl_job.py
      source:
        fn::FileAsset: jobs/etl_job.py

The command block specifies the script location in S3 and sets the job type to glueetl (Spark-based ETL). The workerType and numberOfWorkers properties define compute capacity; G.1X workers provide 1 DPU each. The defaultArguments map controls Glue-specific behavior: auto-scaling adjusts worker count dynamically, and CloudWatch integration sends logs to the specified log group. The glueVersion determines which Spark and Python versions run your code.

Run lightweight Python scripts without Spark

Python shell jobs execute simple scripts that don’t need Spark’s distributed processing, such as API calls or data validation.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role for Glue jobs
// Trust policy lets the AWS Glue service assume this execution role.
const glueJobRole = new aws.iam.Role("glue_job_role", {
    name: "glue-job-role",
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Action: "sts:AssumeRole",
            Effect: "Allow",
            Principal: {
                Service: "glue.amazonaws.com",
            },
        }],
    }),
});
// Python shell job ("pythonshell") — a single Python process, sized by
// fractional DPUs via maxCapacity rather than worker counts.
// NOTE(review): `example` (Glue connection) and `glueScripts` (S3 bucket)
// are defined outside this snippet — confirm they exist.
const pythonShellJob = new aws.glue.Job("python_shell_job", {
    name: "example-python-shell-job",
    description: "An example Python shell job",
    roleArn: glueJobRole.arn,
    maxCapacity: 0.0625,
    maxRetries: 0,
    timeout: 2880,
    connections: [example.name],
    command: {
        scriptLocation: `s3://${glueScripts.bucket}/jobs/shell_job.py`,
        name: "pythonshell",
        pythonVersion: "3.9",
    },
    defaultArguments: {
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics",
    },
    executionProperty: {
        maxConcurrentRuns: 1,
    },
    tags: {
        ManagedBy: "AWS",
    },
});
// Upload the script the job's scriptLocation points at.
const pythonShellScript = new aws.s3.BucketObjectv2("python_shell_script", {
    bucket: glueScripts.id,
    key: "jobs/shell_job.py",
    source: new pulumi.asset.FileAsset("jobs/shell_job.py"),
});
import pulumi
import json
import pulumi_aws as aws

# IAM role for Glue jobs
# Trust policy lets the AWS Glue service assume this execution role.
glue_job_role = aws.iam.Role("glue_job_role",
    name="glue-job-role",
    assume_role_policy=json.dumps({
        "Version": "2012-10-17",
        "Statement": [{
            "Action": "sts:AssumeRole",
            "Effect": "Allow",
            "Principal": {
                "Service": "glue.amazonaws.com",
            },
        }],
    }))
# Python shell job ("pythonshell") — a single Python process, sized by
# fractional DPUs via max_capacity rather than worker counts.
# NOTE(review): `example` (Glue connection) and `glue_scripts` (S3 bucket)
# are defined outside this snippet — confirm they exist.
python_shell_job = aws.glue.Job("python_shell_job",
    name="example-python-shell-job",
    description="An example Python shell job",
    role_arn=glue_job_role.arn,
    max_capacity=0.0625,
    max_retries=0,
    timeout=2880,
    connections=[example["name"]],
    command={
        "script_location": f"s3://{glue_scripts['bucket']}/jobs/shell_job.py",
        "name": "pythonshell",
        "python_version": "3.9",
    },
    default_arguments={
        "--job-language": "python",
        "--continuous-log-logGroup": "/aws-glue/jobs",
        "--enable-continuous-cloudwatch-log": "true",
        "library-set": "analytics",
    },
    execution_property={
        "max_concurrent_runs": 1,
    },
    tags={
        "ManagedBy": "AWS",
    })
# Upload the script the job's script_location points at.
python_shell_script = aws.s3.BucketObjectv2("python_shell_script",
    bucket=glue_scripts["id"],
    key="jobs/shell_job.py",
    source=pulumi.FileAsset("jobs/shell_job.py"))
package main

import (
	"encoding/json"

	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/iam"
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/s3"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Trust policy letting the AWS Glue service assume the execution role.
		tmpJSON0, err := json.Marshal(map[string]interface{}{
			"Version": "2012-10-17",
			"Statement": []map[string]interface{}{
				map[string]interface{}{
					"Action": "sts:AssumeRole",
					"Effect": "Allow",
					"Principal": map[string]interface{}{
						"Service": "glue.amazonaws.com",
					},
				},
			},
		})
		if err != nil {
			return err
		}
		json0 := string(tmpJSON0)
		// IAM role for Glue jobs
		glueJobRole, err := iam.NewRole(ctx, "glue_job_role", &iam.RoleArgs{
			Name:             pulumi.String("glue-job-role"),
			AssumeRolePolicy: pulumi.String(json0),
		})
		if err != nil {
			return err
		}
		// Python shell job ("pythonshell") sized by fractional DPUs (MaxCapacity).
		// NOTE(review): `example` (Glue connection) and `glueScripts` (S3 bucket)
		// are defined outside this snippet — confirm they exist.
		_, err = glue.NewJob(ctx, "python_shell_job", &glue.JobArgs{
			Name:        pulumi.String("example-python-shell-job"),
			Description: pulumi.String("An example Python shell job"),
			RoleArn:     glueJobRole.Arn,
			MaxCapacity: pulumi.Float64(0.0625),
			MaxRetries:  pulumi.Int(0),
			Timeout:     pulumi.Int(2880),
			Connections: pulumi.StringArray{
				example.Name,
			},
			Command: &glue.JobCommandArgs{
				ScriptLocation: pulumi.Sprintf("s3://%v/jobs/shell_job.py", glueScripts.Bucket),
				Name:           pulumi.String("pythonshell"),
				PythonVersion:  pulumi.String("3.9"),
			},
			DefaultArguments: pulumi.StringMap{
				"--job-language":                     pulumi.String("python"),
				"--continuous-log-logGroup":          pulumi.String("/aws-glue/jobs"),
				"--enable-continuous-cloudwatch-log": pulumi.String("true"),
				"library-set":                        pulumi.String("analytics"),
			},
			ExecutionProperty: &glue.JobExecutionPropertyArgs{
				MaxConcurrentRuns: pulumi.Int(1),
			},
			Tags: pulumi.StringMap{
				"ManagedBy": pulumi.String("AWS"),
			},
		})
		if err != nil {
			return err
		}
		// Upload the script the job's ScriptLocation points at.
		_, err = s3.NewBucketObjectv2(ctx, "python_shell_script", &s3.BucketObjectv2Args{
			Bucket: pulumi.Any(glueScripts.Id),
			Key:    pulumi.String("jobs/shell_job.py"),
			Source: pulumi.NewFileAsset("jobs/shell_job.py"),
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    // IAM role for Glue jobs
    // Trust policy lets the AWS Glue service assume this execution role.
    var glueJobRole = new Aws.Iam.Role("glue_job_role", new()
    {
        Name = "glue-job-role",
        AssumeRolePolicy = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Version"] = "2012-10-17",
            ["Statement"] = new[]
            {
                new Dictionary<string, object?>
                {
                    ["Action"] = "sts:AssumeRole",
                    ["Effect"] = "Allow",
                    ["Principal"] = new Dictionary<string, object?>
                    {
                        ["Service"] = "glue.amazonaws.com",
                    },
                },
            },
        }),
    });

    // Python shell job ("pythonshell") sized by fractional DPUs (MaxCapacity).
    // NOTE(review): `example` (Glue connection) and `glueScripts` (S3 bucket)
    // are defined outside this snippet — confirm they exist.
    var pythonShellJob = new Aws.Glue.Job("python_shell_job", new()
    {
        Name = "example-python-shell-job",
        Description = "An example Python shell job",
        RoleArn = glueJobRole.Arn,
        MaxCapacity = 0.0625,
        MaxRetries = 0,
        Timeout = 2880,
        Connections = new[]
        {
            example.Name,
        },
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            ScriptLocation = $"s3://{glueScripts.Bucket}/jobs/shell_job.py",
            Name = "pythonshell",
            PythonVersion = "3.9",
        },
        DefaultArguments = 
        {
            { "--job-language", "python" },
            { "--continuous-log-logGroup", "/aws-glue/jobs" },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "library-set", "analytics" },
        },
        ExecutionProperty = new Aws.Glue.Inputs.JobExecutionPropertyArgs
        {
            MaxConcurrentRuns = 1,
        },
        Tags = 
        {
            { "ManagedBy", "AWS" },
        },
    });

    // Upload the script the job's ScriptLocation points at.
    var pythonShellScript = new Aws.S3.BucketObjectv2("python_shell_script", new()
    {
        Bucket = glueScripts.Id,
        Key = "jobs/shell_job.py",
        Source = new FileAsset("jobs/shell_job.py"),
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.iam.Role;
import com.pulumi.aws.iam.RoleArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import com.pulumi.aws.glue.inputs.JobExecutionPropertyArgs;
import com.pulumi.aws.s3.BucketObjectv2;
import com.pulumi.aws.s3.BucketObjectv2Args;
import com.pulumi.asset.FileAsset;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // IAM role for Glue jobs
        // Trust policy lets the AWS Glue service assume this execution role.
        var glueJobRole = new Role("glueJobRole", RoleArgs.builder()
            .name("glue-job-role")
            .assumeRolePolicy(serializeJson(
                jsonObject(
                    jsonProperty("Version", "2012-10-17"),
                    jsonProperty("Statement", jsonArray(jsonObject(
                        jsonProperty("Action", "sts:AssumeRole"),
                        jsonProperty("Effect", "Allow"),
                        jsonProperty("Principal", jsonObject(
                            jsonProperty("Service", "glue.amazonaws.com")
                        ))
                    )))
                )))
            .build());

        // Python shell job ("pythonshell") sized by fractional DPUs (maxCapacity).
        // NOTE(review): `example` (Glue connection) and `glueScripts` (S3 bucket)
        // are defined outside this snippet — confirm they exist.
        var pythonShellJob = new Job("pythonShellJob", JobArgs.builder()
            .name("example-python-shell-job")
            .description("An example Python shell job")
            .roleArn(glueJobRole.arn())
            .maxCapacity(0.0625)
            .maxRetries(0)
            .timeout(2880)
            .connections(example.name())
            .command(JobCommandArgs.builder()
                .scriptLocation(String.format("s3://%s/jobs/shell_job.py", glueScripts.bucket()))
                .name("pythonshell")
                .pythonVersion("3.9")
                .build())
            .defaultArguments(Map.ofEntries(
                Map.entry("--job-language", "python"),
                Map.entry("--continuous-log-logGroup", "/aws-glue/jobs"),
                Map.entry("--enable-continuous-cloudwatch-log", "true"),
                Map.entry("library-set", "analytics")
            ))
            .executionProperty(JobExecutionPropertyArgs.builder()
                .maxConcurrentRuns(1)
                .build())
            .tags(Map.of("ManagedBy", "AWS"))
            .build());

        // Upload the script the job's scriptLocation points at.
        var pythonShellScript = new BucketObjectv2("pythonShellScript", BucketObjectv2Args.builder()
            .bucket(glueScripts.id())
            .key("jobs/shell_job.py")
            .source(new FileAsset("jobs/shell_job.py"))
            .build());

    }
}
resources:
  # Python shell job ("pythonshell") sized by fractional DPUs (maxCapacity).
  # NOTE(review): ${example} (Glue connection) and ${glueScripts} (S3 bucket)
  # are defined outside this snippet — confirm they exist.
  pythonShellJob:
    type: aws:glue:Job
    name: python_shell_job
    properties:
      name: example-python-shell-job
      description: An example Python shell job
      roleArn: ${glueJobRole.arn}
      maxCapacity: '0.0625'
      maxRetries: 0
      timeout: 2880
      connections:
        - ${example.name}
      command:
        scriptLocation: s3://${glueScripts.bucket}/jobs/shell_job.py
        name: pythonshell
        pythonVersion: '3.9'
      defaultArguments:
        --job-language: python
        --continuous-log-logGroup: /aws-glue/jobs
        --enable-continuous-cloudwatch-log: 'true'
        library-set: analytics
      executionProperty:
        maxConcurrentRuns: 1
      tags:
        ManagedBy: AWS
  # IAM role for Glue jobs
  # Trust policy lets the AWS Glue service assume this execution role.
  glueJobRole:
    type: aws:iam:Role
    name: glue_job_role
    properties:
      name: glue-job-role
      assumeRolePolicy:
        fn::toJSON:
          Version: 2012-10-17
          Statement:
            - Action: sts:AssumeRole
              Effect: Allow
              Principal:
                Service: glue.amazonaws.com
  # Upload the script the job's scriptLocation points at.
  pythonShellScript:
    type: aws:s3:BucketObjectv2
    name: python_shell_script
    properties:
      bucket: ${glueScripts.id}
      key: jobs/shell_job.py
      source:
        fn::FileAsset: jobs/shell_job.py

The command name switches to pythonshell, which runs a single Python process instead of a Spark cluster. The maxCapacity property allocates fractional DPUs (0.0625 or 1.0), making these jobs cost-effective for lightweight tasks. The pythonVersion property controls which Python runtime executes your script.

Run distributed Python with Ray framework

Ray jobs enable distributed Python workloads using the Ray framework, providing more flexible parallelism than Spark for machine learning tasks.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Ray job ("glueray" command) on Glue 4.0 with Z.2X workers and the Ray2.4
// runtime. NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are
// defined outside this snippet — confirm they exist.
const example = new aws.glue.Job("example", {
    name: "example",
    roleArn: exampleAwsIamRole.arn,
    glueVersion: "4.0",
    workerType: "Z.2X",
    command: {
        name: "glueray",
        pythonVersion: "3.9",
        runtime: "Ray2.4",
        scriptLocation: `s3://${exampleAwsS3Bucket.bucket}/example.py`,
    },
});
import pulumi
import pulumi_aws as aws

# Ray job ("glueray" command) on Glue 4.0 with Z.2X workers and the Ray2.4
# runtime. NOTE(review): `example_aws_iam_role` and `example_aws_s3_bucket`
# are defined outside this snippet — confirm they exist.
example = aws.glue.Job("example",
    name="example",
    role_arn=example_aws_iam_role["arn"],
    glue_version="4.0",
    worker_type="Z.2X",
    command={
        "name": "glueray",
        "python_version": "3.9",
        "runtime": "Ray2.4",
        "script_location": f"s3://{example_aws_s3_bucket['bucket']}/example.py",
    })
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Ray job ("glueray") on Glue 4.0 with Z.2X workers and the Ray2.4
		// runtime. NOTE(review): exampleAwsIamRole and exampleAwsS3Bucket are
		// defined outside this snippet — confirm they exist.
		_, err := glue.NewJob(ctx, "example", &glue.JobArgs{
			Name:        pulumi.String("example"),
			RoleArn:     pulumi.Any(exampleAwsIamRole.Arn),
			GlueVersion: pulumi.String("4.0"),
			WorkerType:  pulumi.String("Z.2X"),
			Command: &glue.JobCommandArgs{
				Name:           pulumi.String("glueray"),
				PythonVersion:  pulumi.String("3.9"),
				Runtime:        pulumi.String("Ray2.4"),
				ScriptLocation: pulumi.Sprintf("s3://%v/example.py", exampleAwsS3Bucket.Bucket),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    // Ray job ("glueray") on Glue 4.0 with Z.2X workers and the Ray2.4
    // runtime. NOTE(review): exampleAwsIamRole and exampleAwsS3Bucket are
    // defined outside this snippet — confirm they exist.
    var example = new Aws.Glue.Job("example", new()
    {
        Name = "example",
        RoleArn = exampleAwsIamRole.Arn,
        GlueVersion = "4.0",
        WorkerType = "Z.2X",
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            Name = "glueray",
            PythonVersion = "3.9",
            Runtime = "Ray2.4",
            ScriptLocation = $"s3://{exampleAwsS3Bucket.Bucket}/example.py",
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Ray job ("glueray") on Glue 4.0 with Z.2X workers and the Ray2.4
        // runtime. NOTE(review): exampleAwsIamRole and exampleAwsS3Bucket are
        // defined outside this snippet — confirm they exist.
        var example = new Job("example", JobArgs.builder()
            .name("example")
            .roleArn(exampleAwsIamRole.arn())
            .glueVersion("4.0")
            .workerType("Z.2X")
            .command(JobCommandArgs.builder()
                .name("glueray")
                .pythonVersion("3.9")
                .runtime("Ray2.4")
                .scriptLocation(String.format("s3://%s/example.py", exampleAwsS3Bucket.bucket()))
                .build())
            .build());

    }
}
resources:
  # Ray job ("glueray") on Glue 4.0 with Z.2X workers and the Ray2.4 runtime.
  # NOTE(review): ${exampleAwsIamRole} and ${exampleAwsS3Bucket} are defined
  # outside this snippet — confirm they exist.
  example:
    type: aws:glue:Job
    properties:
      name: example
      roleArn: ${exampleAwsIamRole.arn}
      glueVersion: '4.0'
      workerType: Z.2X
      command:
        name: glueray
        pythonVersion: '3.9'
        runtime: Ray2.4
        scriptLocation: s3://${exampleAwsS3Bucket.bucket}/example.py

The command name glueray activates Ray support. The runtime property specifies the Ray version (Ray2.4 in this example). Ray jobs require glueVersion 4.0 or higher and use Z.2X worker types. Unlike Spark ETL jobs, Ray jobs should leave the timeout property unconfigured.

Process continuous data streams

Streaming jobs run continuously to process data from sources like Kinesis or Kafka, transforming records as they arrive.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Streaming job ("gluestreaming" command) that runs continuously.
// NOTE(review): `exampleAwsIamRole` and `exampleAwsS3Bucket` are defined
// outside this snippet — confirm they exist.
const example = new aws.glue.Job("example", {
    name: "example streaming job",
    roleArn: exampleAwsIamRole.arn,
    command: {
        name: "gluestreaming",
        scriptLocation: `s3://${exampleAwsS3Bucket.bucket}/example.script`,
    },
});
import pulumi
import pulumi_aws as aws

# Streaming job ("gluestreaming" command) that runs continuously.
# NOTE(review): `example_aws_iam_role` and `example_aws_s3_bucket` are
# defined outside this snippet — confirm they exist.
example = aws.glue.Job("example",
    name="example streaming job",
    role_arn=example_aws_iam_role["arn"],
    command={
        "name": "gluestreaming",
        "script_location": f"s3://{example_aws_s3_bucket['bucket']}/example.script",
    })
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Streaming job ("gluestreaming") that runs continuously.
		// NOTE(review): exampleAwsIamRole and exampleAwsS3Bucket are defined
		// outside this snippet — confirm they exist.
		_, err := glue.NewJob(ctx, "example", &glue.JobArgs{
			Name:    pulumi.String("example streaming job"),
			RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
			Command: &glue.JobCommandArgs{
				Name:           pulumi.String("gluestreaming"),
				ScriptLocation: pulumi.Sprintf("s3://%v/example.script", exampleAwsS3Bucket.Bucket),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    // Streaming job ("gluestreaming") that runs continuously.
    // NOTE(review): exampleAwsIamRole and exampleAwsS3Bucket are defined
    // outside this snippet — confirm they exist.
    var example = new Aws.Glue.Job("example", new()
    {
        Name = "example streaming job",
        RoleArn = exampleAwsIamRole.Arn,
        Command = new Aws.Glue.Inputs.JobCommandArgs
        {
            Name = "gluestreaming",
            ScriptLocation = $"s3://{exampleAwsS3Bucket.Bucket}/example.script",
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import com.pulumi.aws.glue.inputs.JobCommandArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

// Registers a streaming Glue job: the "gluestreaming" command name keeps the
// job running continuously, processing records as they arrive.
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Streaming entry point and its script location in S3.
        var streamingCommand = JobCommandArgs.builder()
            .name("gluestreaming")
            .scriptLocation(String.format("s3://%s/example.script", exampleAwsS3Bucket.bucket()))
            .build();

        var example = new Job("example", JobArgs.builder()
            .name("example streaming job")
            .roleArn(exampleAwsIamRole.arn())
            .command(streamingCommand)
            .build());
    }
}
# Streaming Glue job: the "gluestreaming" command name keeps the job running
# continuously, processing records from the source as they arrive.
resources:
  example:
    type: aws:glue:Job
    properties:
      name: example streaming job
      roleArn: ${exampleAwsIamRole.arn}
      command:
        # "gluestreaming" selects the continuous (streaming) execution mode.
        name: gluestreaming
        scriptLocation: s3://${exampleAwsS3Bucket.bucket}/example.script

The command name gluestreaming configures the job for continuous execution. Streaming jobs default to unlimited timeout (0), running until explicitly stopped. They process records in micro-batches as data arrives, rather than waiting for complete datasets.

Enable CloudWatch logging and metrics

Glue jobs send logs and metrics to CloudWatch for monitoring execution and debugging failures.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// CloudWatch log group that receives the job's continuous logs; entries
// expire after 14 days.
const example = new aws.cloudwatch.LogGroup("example", {
    name: "example",
    retentionInDays: 14,
});

// Special Glue arguments: continuous CloudWatch logging into the group above,
// log filtering, and job metrics (the metrics flag takes an empty value).
const cloudwatchArgs = {
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
    "--enable-metrics": "",
};
const exampleJob = new aws.glue.Job("example", { defaultArguments: cloudwatchArgs });
import pulumi
import pulumi_aws as aws

# CloudWatch log group that receives the job's continuous logs; entries
# expire after 14 days.
example = aws.cloudwatch.LogGroup(
    "example",
    name="example",
    retention_in_days=14,
)

# Special Glue arguments: continuous CloudWatch logging into the group above,
# log filtering, and job metrics (the metrics flag takes an empty value).
cloudwatch_args = {
    "--continuous-log-logGroup": example.name,
    "--enable-continuous-cloudwatch-log": "true",
    "--enable-continuous-log-filter": "true",
    "--enable-metrics": "",
}
example_job = aws.glue.Job("example", default_arguments=cloudwatch_args)
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/cloudwatch"
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main provisions a 14-day CloudWatch log group and a Glue job whose special
// arguments stream continuous logs into it and enable job metrics.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		logGroup, err := cloudwatch.NewLogGroup(ctx, "example", &cloudwatch.LogGroupArgs{
			Name:            pulumi.String("example"),
			RetentionInDays: pulumi.Int(14),
		})
		if err != nil {
			return err
		}
		// "--enable-metrics" is a flag-style argument; Glue expects an empty value.
		cloudwatchArgs := pulumi.StringMap{
			"--continuous-log-logGroup":          logGroup.Name,
			"--enable-continuous-cloudwatch-log": pulumi.String("true"),
			"--enable-continuous-log-filter":     pulumi.String("true"),
			"--enable-metrics":                   pulumi.String(""),
		}
		if _, err := glue.NewJob(ctx, "example", &glue.JobArgs{
			DefaultArguments: cloudwatchArgs,
		}); err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

// Provision a 14-day CloudWatch log group, then point a Glue job's
// continuous-logging arguments at it.
return await Deployment.RunAsync(() => 
{
    var example = new Aws.CloudWatch.LogGroup("example", new()
    {
        Name = "example",
        RetentionInDays = 14,
    });

    // Special Glue arguments: continuous CloudWatch logging into the group
    // above, log filtering, and job metrics (the flag takes an empty value).
    var exampleJob = new Aws.Glue.Job("example", new()
    {
        DefaultArguments = 
        {
            { "--continuous-log-logGroup", example.Name },
            { "--enable-continuous-cloudwatch-log", "true" },
            { "--enable-continuous-log-filter", "true" },
            { "--enable-metrics", "" },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.cloudwatch.LogGroup;
import com.pulumi.aws.cloudwatch.LogGroupArgs;
import com.pulumi.aws.glue.Job;
import com.pulumi.aws.glue.JobArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

// Provisions a 14-day CloudWatch log group and a Glue job whose special
// arguments stream continuous logs into it and enable job metrics.
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Log group receiving the job's continuous logs, expiring after 14 days.
        var example = new LogGroup("example", LogGroupArgs.builder()
            .name("example")
            .retentionInDays(14)
            .build());

        // "--enable-metrics" is a flag-style argument; Glue expects an empty value.
        var cloudwatchArgs = Map.ofEntries(
            Map.entry("--continuous-log-logGroup", example.name()),
            Map.entry("--enable-continuous-cloudwatch-log", "true"),
            Map.entry("--enable-continuous-log-filter", "true"),
            Map.entry("--enable-metrics", "")
        );

        var exampleJob = new Job("exampleJob", JobArgs.builder()
            .defaultArguments(cloudwatchArgs)
            .build());
    }
}
# CloudWatch log group receiving the job's continuous logs (14-day retention).
resources:
  example:
    type: aws:cloudwatch:LogGroup
    properties:
      name: example
      retentionInDays: 14
  # Glue job wired to the log group above via special --continuous-log arguments.
  exampleJob:
    type: aws:glue:Job
    name: example
    properties:
      defaultArguments:
        --continuous-log-logGroup: ${example.name}
        --enable-continuous-cloudwatch-log: 'true'
        --enable-continuous-log-filter: 'true'
        # Flag-style argument: presence enables job metrics; the value stays empty.
        --enable-metrics: ""

The defaultArguments map controls CloudWatch integration. The continuous-log-logGroup argument specifies where logs are written. Setting enable-continuous-cloudwatch-log to “true” streams logs in real-time. The enable-metrics flag (empty string value) activates CloudWatch metrics for job monitoring. These arguments work with any job type.

Beyond these examples

These snippets focus on specific Glue job features: ETL, Python shell, Ray, and streaming job types, worker configuration and auto-scaling, and CloudWatch logging and metrics. They’re intentionally minimal rather than full data pipelines.

The examples reference pre-existing infrastructure such as IAM execution roles with Glue service trust, S3 buckets for script storage, Glue connections for database access, and CloudWatch log groups. They focus on configuring the job rather than provisioning the surrounding infrastructure.

To keep things focused, common job patterns are omitted, including:

  • Security configurations for encryption
  • Job run queuing and maintenance windows
  • Retry policies and timeout tuning
  • Non-overridable arguments and execution concurrency
  • Source control integration

These omissions are intentional: the goal is to illustrate how each job type is wired, not provide drop-in ETL pipelines. See the Glue Job resource reference for all available configuration options.

Let's create AWS Glue ETL Jobs

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Try Pulumi Cloud for FREE

Frequently Asked Questions

Job Types & Configuration
What are the different types of Glue jobs I can create?
Glue supports four job types via command.name: glueetl (ETL jobs), pythonshell (Python shell scripts), glueray (Ray jobs), and gluestreaming (streaming jobs). Each type has different configuration requirements.
How do I create a Ray job in Glue?
Set glueVersion to 4.0 or higher, workerType to Z.2X, and configure command with name: "glueray", pythonVersion: "3.9", and a runtime like Ray2.4.
How do I create a Python shell job?
Set command.name to pythonshell, specify maxCapacity as either 0.0625 or 1.0, and configure pythonVersion in the command block (e.g., 3.9).
How do I run a Scala job?
Point command.scriptLocation to your .scala file in S3 and set defaultArguments with "--job-language": "scala".
Worker & Capacity Settings
When should I use maxCapacity vs numberOfWorkers and workerType?
Use maxCapacity (0.0625 or 1.0) for pythonshell jobs. For glueVersion 2.0 and above, use numberOfWorkers and workerType instead of maxCapacity.
What worker types are available for Glue jobs?
Valid workerType values include Standard, G.1X, G.2X, G.025X, G.4X, G.8X, G.12X, G.16X, R.1X, R.2X, R.4X, R.8X, and Z.2X (for Ray jobs).
What's the difference between FLEX and STANDARD execution classes?
STANDARD execution is ideal for time-sensitive workloads requiring fast job startup and dedicated resources. FLEX runs jobs on spare AWS capacity at lower cost, suited to non-urgent workloads that can tolerate variable start times.
Monitoring & Logging
How do I enable CloudWatch logging and metrics for my Glue job?
Configure defaultArguments with "--continuous-log-logGroup": "/aws-glue/jobs", "--enable-continuous-cloudwatch-log": "true", "--enable-continuous-log-filter": "true", and "--enable-metrics": "".
Where do I configure monitoring and logging settings?
Glue monitoring and logging are typically managed through the defaultArguments property, which accepts AWS Glue special parameters for CloudWatch integration.
Timeouts & Execution
What are the default timeouts for different Glue job types?
glueetl and pythonshell jobs default to 2880 minutes (48 hours). gluestreaming jobs default to 0 (unlimited). Leave timeout unconfigured for glueray jobs.
Common Issues
Can I rename a Glue job after creation?
No, the name property is immutable and cannot be changed after the job is created.
What Glue version do I need for Ray jobs?
Ray jobs require glueVersion set to 4.0 or greater.

Using a different cloud?

Explore analytics guides for other cloud providers: