Configure AWS Glue Catalog Table Optimizer

The aws:glue/catalogTableOptimizer:CatalogTableOptimizer resource, part of the Pulumi AWS provider, configures automated maintenance tasks for Glue Catalog tables: file compaction, snapshot retention, and orphan file cleanup. This guide focuses on three optimizer types: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file deletion.

Table optimizers reference existing Glue Catalog tables and require IAM roles with permissions for Glue and S3 operations. The examples are intentionally small. Combine them with your own Glue tables, IAM roles, and monitoring configuration.

Compact small files into larger ones

Data lakes accumulate many small files as streaming writes create fragmentation. Compaction merges these files, improving query performance and reducing metadata overhead.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Optimizer settings: the IAM role Glue assumes to run compaction, and the
// switch that activates the optimizer.
const compactionSettings = {
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    enabled: true,
};

// Attach a compaction optimizer to an existing Glue Catalog table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "compaction",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: compactionSettings,
});

import pulumi
import pulumi_aws as aws

# Optimizer settings: the IAM role Glue assumes to run compaction, and the
# switch that activates the optimizer.
compaction_settings = {
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "enabled": True,
}

# Attach a compaction optimizer to an existing Glue Catalog table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="compaction",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=compaction_settings,
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a compaction optimizer for an existing Glue Catalog table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// The IAM role Glue assumes to run compaction, and the switch that
		// activates the optimizer.
		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("compaction"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		}

		// Only the error matters here; the resource handle is not used further.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // The IAM role Glue assumes to run compaction, and the switch that
    // activates the optimizer.
    var compactionSettings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
    };

    // Attach a compaction optimizer to an existing Glue Catalog table.
    var example = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "compaction",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = compactionSettings,
    });
});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that attaches a compaction optimizer to an existing
 * Glue Catalog table.
 */
public class App {
    /** Program entry point; hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the resources of this stack.
     *
     * @param ctx the Pulumi deployment context
     */
    public static void stack(Context ctx) {
        // The IAM role Glue assumes to run compaction, and the switch that
        // activates the optimizer.
        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("compaction")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}

# Compaction optimizer attached to an existing Glue Catalog table.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      # Target table: catalog ID (quoted so YAML keeps it a string), database, and table name.
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      configuration:
        # IAM role that Glue assumes to perform compaction.
        roleArn: arn:aws:iam::123456789012:role/example-role
        # true activates the optimizer.
        enabled: true
      # Selects the optimizer type; "compaction" enables automatic file merging.
      type: compaction

The type property set to “compaction” enables automatic file merging. The configuration block specifies the IAM role that Glue assumes to perform compaction operations. Setting enabled to true activates the optimizer; Glue schedules compaction runs automatically based on table activity.

Clean up old Iceberg table snapshots

Iceberg tables create snapshots for time travel, but these accumulate storage costs. Retention policies delete old snapshots while preserving recent history.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Iceberg snapshot retention settings: the retention time window in days,
// the snapshot count limit, and whether data files from expired snapshots
// are deleted.
const icebergRetention = {
    snapshotRetentionPeriodInDays: 7,
    numberOfSnapshotsToRetain: 3,
    cleanExpiredFiles: true,
};

// Attach a snapshot retention optimizer to an existing Glue Catalog table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "retention",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: {
        roleArn: "arn:aws:iam::123456789012:role/example-role",
        enabled: true,
        retentionConfiguration: {
            icebergConfiguration: icebergRetention,
        },
    },
});

import pulumi
import pulumi_aws as aws

# Iceberg snapshot retention settings: the retention time window in days,
# the snapshot count limit, and whether data files from expired snapshots
# are deleted.
iceberg_retention = {
    "snapshot_retention_period_in_days": 7,
    "number_of_snapshots_to_retain": 3,
    "clean_expired_files": True,
}

# Attach a snapshot retention optimizer to an existing Glue Catalog table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="retention",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration={
        "role_arn": "arn:aws:iam::123456789012:role/example-role",
        "enabled": True,
        "retention_configuration": {
            "iceberg_configuration": iceberg_retention,
        },
    },
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a snapshot retention optimizer for an existing Glue Catalog table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Retention time window in days, snapshot count limit, and whether
		// data files from expired snapshots are deleted.
		iceberg := &glue.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs{
			SnapshotRetentionPeriodInDays: pulumi.Int(7),
			NumberOfSnapshotsToRetain:     pulumi.Int(3),
			CleanExpiredFiles:             pulumi.Bool(true),
		}

		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
			RetentionConfiguration: &glue.CatalogTableOptimizerConfigurationRetentionConfigurationArgs{
				IcebergConfiguration: iceberg,
			},
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("retention"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		}

		// Only the error matters here; the resource handle is not used further.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Retention time window in days, snapshot count limit, and whether
    // data files from expired snapshots are deleted.
    var icebergRetention = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs
    {
        SnapshotRetentionPeriodInDays = 7,
        NumberOfSnapshotsToRetain = 3,
        CleanExpiredFiles = true,
    };

    var retentionSettings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
        RetentionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs
        {
            IcebergConfiguration = icebergRetention,
        },
    };

    // Attach a snapshot retention optimizer to an existing Glue Catalog table.
    var example = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "retention",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = retentionSettings,
    });
});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that attaches a snapshot retention optimizer to an existing
 * Glue Catalog table.
 */
public class App {
    /** Program entry point; hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the resources of this stack.
     *
     * @param ctx the Pulumi deployment context
     */
    public static void stack(Context ctx) {
        // Retention time window in days, snapshot count limit, and whether
        // data files from expired snapshots are deleted.
        final var icebergRetention =
            CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs.builder()
                .snapshotRetentionPeriodInDays(7)
                .numberOfSnapshotsToRetain(3)
                .cleanExpiredFiles(true)
                .build();

        final var retention = CatalogTableOptimizerConfigurationRetentionConfigurationArgs.builder()
            .icebergConfiguration(icebergRetention)
            .build();

        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .retentionConfiguration(retention)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("retention")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}

# Snapshot retention optimizer attached to an existing Glue Catalog (Iceberg) table.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      # Target table: catalog ID (quoted so YAML keeps it a string), database, and table name.
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      configuration:
        # IAM role used by the optimizer.
        roleArn: arn:aws:iam::123456789012:role/example-role
        # true activates the optimizer.
        enabled: true
        retentionConfiguration:
          icebergConfiguration:
            # Time window, in days, for snapshot retention.
            snapshotRetentionPeriodInDays: 7
            # Snapshot count limit.
            numberOfSnapshotsToRetain: 3
            # Whether to delete data files from expired snapshots.
            cleanExpiredFiles: true
      # Selects the optimizer type; "retention" enables snapshot cleanup.
      type: retention

The type property set to “retention” enables snapshot cleanup for Iceberg tables. The retentionConfiguration block controls which snapshots to keep: snapshotRetentionPeriodInDays sets the time window, numberOfSnapshotsToRetain sets the number of most recent snapshots that are always kept, and cleanExpiredFiles determines whether to delete data files from expired snapshots. The two limits work together: a snapshot is expired only when it is older than the retention period and is not among the most recent snapshots being retained.

Remove orphaned files from table storage

Failed writes or incomplete transactions leave files in S3 that table metadata no longer references. These orphaned files consume storage without contributing to queries.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Iceberg orphan file settings: how many days to wait before deleting
// unreferenced files, and the S3 path to scan.
const icebergOrphanCleanup = {
    orphanFileRetentionPeriodInDays: 7,
    location: "s3://example-bucket/example_table/",
};

// Attach an orphan file deletion optimizer to an existing Glue Catalog table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "orphan_file_deletion",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: {
        roleArn: "arn:aws:iam::123456789012:role/example-role",
        enabled: true,
        orphanFileDeletionConfiguration: {
            icebergConfiguration: icebergOrphanCleanup,
        },
    },
});

import pulumi
import pulumi_aws as aws

# Iceberg orphan file settings: how many days to wait before deleting
# unreferenced files, and the S3 path to scan.
iceberg_orphan_cleanup = {
    "orphan_file_retention_period_in_days": 7,
    "location": "s3://example-bucket/example_table/",
}

# Attach an orphan file deletion optimizer to an existing Glue Catalog table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="orphan_file_deletion",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration={
        "role_arn": "arn:aws:iam::123456789012:role/example-role",
        "enabled": True,
        "orphan_file_deletion_configuration": {
            "iceberg_configuration": iceberg_orphan_cleanup,
        },
    },
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers an orphan file deletion optimizer for an existing Glue Catalog table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// How many days to wait before deleting unreferenced files, and the
		// S3 path to scan.
		iceberg := &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs{
			OrphanFileRetentionPeriodInDays: pulumi.Int(7),
			Location:                        pulumi.String("s3://example-bucket/example_table/"),
		}

		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
			OrphanFileDeletionConfiguration: &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs{
				IcebergConfiguration: iceberg,
			},
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("orphan_file_deletion"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		}

		// Only the error matters here; the resource handle is not used further.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // How many days to wait before deleting unreferenced files, and the
    // S3 path to scan.
    var icebergOrphanCleanup = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs
    {
        OrphanFileRetentionPeriodInDays = 7,
        Location = "s3://example-bucket/example_table/",
    };

    var orphanCleanupSettings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
        OrphanFileDeletionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs
        {
            IcebergConfiguration = icebergOrphanCleanup,
        },
    };

    // Attach an orphan file deletion optimizer to an existing Glue Catalog table.
    var example = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "orphan_file_deletion",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = orphanCleanupSettings,
    });
});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that attaches an orphan file deletion optimizer to an
 * existing Glue Catalog table.
 */
public class App {
    /** Program entry point; hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the resources of this stack.
     *
     * @param ctx the Pulumi deployment context
     */
    public static void stack(Context ctx) {
        // How many days to wait before deleting unreferenced files, and the
        // S3 path to scan.
        final var icebergOrphanCleanup =
            CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs.builder()
                .orphanFileRetentionPeriodInDays(7)
                .location("s3://example-bucket/example_table/")
                .build();

        final var orphanCleanup = CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs.builder()
            .icebergConfiguration(icebergOrphanCleanup)
            .build();

        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .orphanFileDeletionConfiguration(orphanCleanup)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("orphan_file_deletion")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}

# Orphan file deletion optimizer attached to an existing Glue Catalog (Iceberg) table.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      # Target table: catalog ID (quoted so YAML keeps it a string), database, and table name.
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      configuration:
        # IAM role used by the optimizer.
        roleArn: arn:aws:iam::123456789012:role/example-role
        # true activates the optimizer.
        enabled: true
        orphanFileDeletionConfiguration:
          icebergConfiguration:
            # How long, in days, to wait before deleting unreferenced files.
            orphanFileRetentionPeriodInDays: 7
            # S3 path to scan for orphaned files.
            location: s3://example-bucket/example_table/
      # Selects the optimizer type; "orphan_file_deletion" enables cleanup of unreferenced files.
      type: orphan_file_deletion

The type property set to “orphan_file_deletion” enables cleanup of unreferenced files. The orphanFileDeletionConfiguration block specifies orphanFileRetentionPeriodInDays (how long to wait before deleting files) and location (the S3 path to scan). Glue identifies files not referenced by table metadata and removes them after the retention period.

Beyond these examples

These snippets focus on specific table optimizer features: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file cleanup. They’re intentionally minimal rather than complete data lake maintenance solutions.

The examples reference pre-existing infrastructure: a Glue Data Catalog with a database and table, an IAM role with Glue and S3 permissions, and (for orphan file deletion) an S3 bucket that holds the table's storage. They focus on optimizer configuration rather than provisioning the surrounding infrastructure.

To keep things focused, common optimizer patterns are omitted, including:

Optimizer scheduling and run frequency
Monitoring optimizer runs and metrics
Cost estimation for optimizer operations
Integration with Glue jobs and crawlers

These omissions are intentional: the goal is to illustrate how each optimizer type is wired, not provide drop-in data lake modules. See the Glue Catalog Table Optimizer resource reference for all available configuration options.

Let's configure AWS Glue Catalog Table Optimizer

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Try Pulumi Cloud for FREE

Frequently Asked Questions

Optimizer Types & Configuration

What optimizer types are available for Glue Catalog tables?

Three optimizer types are available: compaction for file compaction, retention for snapshot retention management, and orphan_file_deletion for cleaning up orphaned files.

How do I configure each optimizer type?

Each type requires different configuration:

Compaction - Basic config with roleArn and enabled
Retention - Add retentionConfiguration with icebergConfiguration containing snapshotRetentionPeriodInDays, numberOfSnapshotsToRetain, and cleanExpiredFiles
Orphan file deletion - Add orphanFileDeletionConfiguration with icebergConfiguration containing orphanFileRetentionPeriodInDays and location

Can I enable or disable an optimizer without deleting it?

Yes, use the enabled boolean in the configuration block to control whether the optimizer runs.

IAM & Permissions

What IAM role do I need for table optimizers?

All optimizer types require a roleArn in the configuration block. The role must have permissions appropriate for the optimizer type’s operations.

Resource Management

How do I import an existing Glue Catalog Table Optimizer?

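Run pulumi import with an ID in the format catalog_id,database_name,table_name,type (comma-separated, no spaces). For example: pulumi import aws:glue/catalogTableOptimizer:CatalogTableOptimizer example 123456789012,example_database,example_table,compaction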

Using a different cloud?

Explore analytics guides for other cloud providers:

Azure Guides GCP Guides