Configure AWS Glue Catalog Table Optimizer

The aws:glue/catalogTableOptimizer:CatalogTableOptimizer resource, part of the Pulumi AWS provider, configures automated maintenance tasks for Glue Catalog tables: file compaction, snapshot retention, and orphan file cleanup. This guide focuses on three optimizer types: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file deletion.

Table optimizers operate on existing Glue Catalog tables and require IAM roles with Glue and S3 permissions. The examples are intentionally small. Combine them with your own Glue databases, tables, and IAM roles.

Compact small files into larger ones

Data lakes accumulate many small files as streaming writes create fragmentation. Compaction merges these files, improving query performance and reducing metadata overhead.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// IAM role the optimizer runs as, plus the flag that switches it on.
const compactionSettings = {
    enabled: true,
    roleArn: "arn:aws:iam::123456789012:role/example-role",
};

// Compaction optimizer for example_database.example_table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "compaction",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: compactionSettings,
});
import pulumi
import pulumi_aws as aws

# IAM role the optimizer runs as, plus the flag that switches it on.
compaction_settings = {
    "enabled": True,
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
}

# Compaction optimizer for example_database.example_table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="compaction",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=compaction_settings,
)
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a compaction optimizer for example_database.example_table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// IAM role the optimizer runs as, plus the flag that switches it on.
		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			Enabled: pulumi.Bool(true),
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
		}

		_, err := glue.NewCatalogTableOptimizer(ctx, "example", &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("compaction"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		})
		// err is nil on success, so returning it directly covers both outcomes.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // IAM role the optimizer runs as, plus the flag that switches it on.
    var settings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        Enabled = true,
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
    };

    // Compaction optimizer for example_database.example_table.
    var optimizer = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "compaction",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = settings,
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that registers a compaction optimizer for
 * {@code example_database.example_table}.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the stack's single resource: a table optimizer of type {@code compaction}.
     *
     * @param ctx Pulumi context passed in by the runtime; not used by this example
     */
    public static void stack(Context ctx) {
        // IAM role the optimizer runs as, plus the flag that switches it on.
        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .enabled(true)
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("compaction")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        // Constructing the resource registers it; the reference itself is not needed afterwards.
        new CatalogTableOptimizer("example", optimizerArgs);
    }
}
resources:
  # Compaction optimizer for example_database.example_table.
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      type: compaction
      catalogId: "123456789012"
      databaseName: example_database
      tableName: example_table
      configuration:
        enabled: true
        roleArn: arn:aws:iam::123456789012:role/example-role

The type property set to “compaction” enables automatic file merging. The configuration block specifies the IAM role that executes compaction jobs and the enabled flag controls whether the optimizer runs. Glue schedules compaction automatically based on table activity.

Clean up old Iceberg table snapshots

Iceberg tables create snapshots for time travel, but these accumulate storage costs. Retention policies remove old snapshots while preserving recent history.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Iceberg snapshot retention: time window, snapshot count, and expired-file cleanup.
const icebergRetention = {
    snapshotRetentionPeriodInDays: 7,
    numberOfSnapshotsToRetain: 3,
    cleanExpiredFiles: true,
};

// IAM role and enabled flag, wrapping the Iceberg retention settings.
const retentionSettings = {
    enabled: true,
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    retentionConfiguration: {
        icebergConfiguration: icebergRetention,
    },
};

// Retention optimizer for example_database.example_table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "retention",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: retentionSettings,
});
import pulumi
import pulumi_aws as aws

# Iceberg snapshot retention: time window, snapshot count, and expired-file cleanup.
iceberg_retention = {
    "snapshot_retention_period_in_days": 7,
    "number_of_snapshots_to_retain": 3,
    "clean_expired_files": True,
}

# IAM role and enabled flag, wrapping the Iceberg retention settings.
retention_settings = {
    "enabled": True,
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "retention_configuration": {
        "iceberg_configuration": iceberg_retention,
    },
}

# Retention optimizer for example_database.example_table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="retention",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=retention_settings,
)
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a retention optimizer for example_database.example_table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Iceberg snapshot retention: time window, snapshot count, and expired-file cleanup.
		icebergRetention := &glue.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs{
			SnapshotRetentionPeriodInDays: pulumi.Int(7),
			NumberOfSnapshotsToRetain:     pulumi.Int(3),
			CleanExpiredFiles:             pulumi.Bool(true),
		}

		// IAM role and enabled flag, wrapping the Iceberg retention settings.
		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			Enabled: pulumi.Bool(true),
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			RetentionConfiguration: &glue.CatalogTableOptimizerConfigurationRetentionConfigurationArgs{
				IcebergConfiguration: icebergRetention,
			},
		}

		_, err := glue.NewCatalogTableOptimizer(ctx, "example", &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("retention"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		})
		// err is nil on success, so returning it directly covers both outcomes.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Iceberg snapshot retention: time window, snapshot count, and expired-file cleanup.
    var icebergRetention = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs
    {
        SnapshotRetentionPeriodInDays = 7,
        NumberOfSnapshotsToRetain = 3,
        CleanExpiredFiles = true,
    };

    // IAM role and enabled flag, wrapping the Iceberg retention settings.
    var settings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        Enabled = true,
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        RetentionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs
        {
            IcebergConfiguration = icebergRetention,
        },
    };

    // Retention optimizer for example_database.example_table.
    var optimizer = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "retention",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = settings,
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that registers a snapshot-retention optimizer for
 * {@code example_database.example_table}.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the stack's single resource: a table optimizer of type {@code retention}.
     *
     * @param ctx Pulumi context passed in by the runtime; not used by this example
     */
    public static void stack(Context ctx) {
        // Iceberg snapshot retention: time window, snapshot count, and expired-file cleanup.
        final var icebergRetention =
            CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs.builder()
                .snapshotRetentionPeriodInDays(7)
                .numberOfSnapshotsToRetain(3)
                .cleanExpiredFiles(true)
                .build();

        final var retention = CatalogTableOptimizerConfigurationRetentionConfigurationArgs.builder()
            .icebergConfiguration(icebergRetention)
            .build();

        // IAM role and enabled flag, wrapping the retention settings.
        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .enabled(true)
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .retentionConfiguration(retention)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("retention")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        // Constructing the resource registers it; the reference itself is not needed afterwards.
        new CatalogTableOptimizer("example", optimizerArgs);
    }
}
resources:
  # Retention optimizer for example_database.example_table.
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      type: retention
      catalogId: "123456789012"
      databaseName: example_database
      tableName: example_table
      configuration:
        enabled: true
        roleArn: arn:aws:iam::123456789012:role/example-role
        retentionConfiguration:
          # Iceberg snapshot retention: time window, snapshot count, expired-file cleanup.
          icebergConfiguration:
            snapshotRetentionPeriodInDays: 7
            numberOfSnapshotsToRetain: 3
            cleanExpiredFiles: true

The type property set to “retention” enables snapshot cleanup for Iceberg tables. The retentionConfiguration block defines which snapshots to keep: snapshotRetentionPeriodInDays sets the time window, numberOfSnapshotsToRetain sets the count limit, and cleanExpiredFiles removes data files no longer referenced by retained snapshots.

Remove orphaned files from table storage

Failed writes and incomplete transactions leave files in S3 that aren’t referenced by table metadata. These orphaned files consume storage without contributing to queries.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
const icebergOrphanCleanup = {
    orphanFileRetentionPeriodInDays: 7,
    location: "s3://example-bucket/example_table/",
};

// IAM role and enabled flag, wrapping the orphan-file cleanup settings.
const orphanCleanupSettings = {
    enabled: true,
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    orphanFileDeletionConfiguration: {
        icebergConfiguration: icebergOrphanCleanup,
    },
};

// Orphan-file-deletion optimizer for example_database.example_table.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "orphan_file_deletion",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: orphanCleanupSettings,
});
import pulumi
import pulumi_aws as aws

# Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
iceberg_orphan_cleanup = {
    "orphan_file_retention_period_in_days": 7,
    "location": "s3://example-bucket/example_table/",
}

# IAM role and enabled flag, wrapping the orphan-file cleanup settings.
orphan_cleanup_settings = {
    "enabled": True,
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "orphan_file_deletion_configuration": {
        "iceberg_configuration": iceberg_orphan_cleanup,
    },
}

# Orphan-file-deletion optimizer for example_database.example_table.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="orphan_file_deletion",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=orphan_cleanup_settings,
)
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers an orphan-file-deletion optimizer for example_database.example_table.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
		icebergOrphanCleanup := &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs{
			OrphanFileRetentionPeriodInDays: pulumi.Int(7),
			Location:                        pulumi.String("s3://example-bucket/example_table/"),
		}

		// IAM role and enabled flag, wrapping the orphan-file cleanup settings.
		settings := &glue.CatalogTableOptimizerConfigurationArgs{
			Enabled: pulumi.Bool(true),
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			OrphanFileDeletionConfiguration: &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs{
				IcebergConfiguration: icebergOrphanCleanup,
			},
		}

		_, err := glue.NewCatalogTableOptimizer(ctx, "example", &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("orphan_file_deletion"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: settings,
		})
		// err is nil on success, so returning it directly covers both outcomes.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
    var icebergOrphanCleanup = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs
    {
        OrphanFileRetentionPeriodInDays = 7,
        Location = "s3://example-bucket/example_table/",
    };

    // IAM role and enabled flag, wrapping the orphan-file cleanup settings.
    var settings = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        Enabled = true,
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        OrphanFileDeletionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs
        {
            IcebergConfiguration = icebergOrphanCleanup,
        },
    };

    // Orphan-file-deletion optimizer for example_database.example_table.
    var optimizer = new Aws.Glue.CatalogTableOptimizer("example", new Aws.Glue.CatalogTableOptimizerArgs
    {
        Type = "orphan_file_deletion",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = settings,
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that registers an orphan-file-deletion optimizer for
 * {@code example_database.example_table}.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the stack's single resource: a table optimizer of type
     * {@code orphan_file_deletion}.
     *
     * @param ctx Pulumi context passed in by the runtime; not used by this example
     */
    public static void stack(Context ctx) {
        // Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
        final var icebergOrphanCleanup =
            CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs.builder()
                .orphanFileRetentionPeriodInDays(7)
                .location("s3://example-bucket/example_table/")
                .build();

        final var orphanCleanup = CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs.builder()
            .icebergConfiguration(icebergOrphanCleanup)
            .build();

        // IAM role and enabled flag, wrapping the orphan-file cleanup settings.
        final var settings = CatalogTableOptimizerConfigurationArgs.builder()
            .enabled(true)
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .orphanFileDeletionConfiguration(orphanCleanup)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("orphan_file_deletion")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(settings)
            .build();

        // Constructing the resource registers it; the reference itself is not needed afterwards.
        new CatalogTableOptimizer("example", optimizerArgs);
    }
}
resources:
  # Orphan-file-deletion optimizer for example_database.example_table.
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      type: orphan_file_deletion
      catalogId: "123456789012"
      databaseName: example_database
      tableName: example_table
      configuration:
        enabled: true
        roleArn: arn:aws:iam::123456789012:role/example-role
        orphanFileDeletionConfiguration:
          # Iceberg orphan-file cleanup: retention period in days and the S3 location to scan.
          icebergConfiguration:
            orphanFileRetentionPeriodInDays: 7
            location: s3://example-bucket/example_table/

The type property set to “orphan_file_deletion” enables cleanup of unreferenced files. The orphanFileDeletionConfiguration block specifies orphanFileRetentionPeriodInDays (how long files must be unreferenced before deletion) and location (the S3 path to scan). The optimizer identifies files not in table metadata and removes them after the retention period.

Beyond these examples

These snippets focus on specific table optimizer features: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file cleanup. They’re intentionally minimal rather than full data lake maintenance solutions.

The examples reference pre-existing infrastructure such as a Glue Data Catalog with an existing database and table, and an IAM role with permissions for Glue and S3 operations. They focus on configuring the optimizer rather than provisioning the underlying catalog resources.

To keep things focused, common optimizer patterns are omitted, including:

  • Optimizer scheduling and frequency controls
  • Monitoring and metrics for optimizer runs
  • Cost estimation and resource limits
  • Integration with Glue workflows or triggers

These omissions are intentional: the goal is to illustrate how each optimizer type is wired, not to provide drop-in data lake modules. See the Glue Catalog Table Optimizer resource reference for all available configuration options.

Let's configure AWS Glue Catalog Table Optimizer

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Try Pulumi Cloud for FREE

Frequently Asked Questions

Optimizer Types & Configuration
What types of table optimizers are available?
Three optimizer types are available: compaction, retention, and orphan_file_deletion.
What's the difference between compaction, retention, and orphan file deletion optimizers?

Each optimizer serves a different purpose:

  • Compaction - Optimizes table file layout (basic configuration with roleArn and enabled flag)
  • Retention - Manages Iceberg snapshot retention with configurable retention periods and snapshot counts
  • Orphan file deletion - Cleans up orphaned files with configurable retention periods and S3 locations
How do I configure snapshot retention for Iceberg tables?
Use the retention optimizer type with retentionConfiguration.icebergConfiguration. You can set snapshotRetentionPeriodInDays, numberOfSnapshotsToRetain, and cleanExpiredFiles.
What configuration is needed for orphan file deletion?
Use the orphan_file_deletion type with orphanFileDeletionConfiguration.icebergConfiguration. Specify orphanFileRetentionPeriodInDays and the S3 location where orphaned files reside.
IAM & Permissions
Do I need an IAM role for table optimizers?
Yes, all optimizer types require a roleArn in the configuration block.
Import & Management
How do I import an existing table optimizer?
Use the format catalog_id,database_name,table_name,type. For example: 123456789012,example_database,example_table,compaction.

Using a different cloud?

Explore analytics guides for other cloud providers: