The aws:glue/catalogTableOptimizer:CatalogTableOptimizer resource, part of the Pulumi AWS provider, configures automated maintenance tasks for Glue Catalog tables: file compaction, snapshot retention, and orphan file cleanup. This guide focuses on three optimizer types: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file deletion.
Table optimizers reference existing Glue Catalog tables and require IAM roles with permissions for Glue and S3 operations. The examples are intentionally small. Combine them with your own Glue tables, IAM roles, and monitoring configuration.
Compact small files into larger ones
Data lakes accumulate many small files as streaming writes create fragmentation. Compaction merges these files, improving query performance and reducing metadata overhead.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Optimizer settings: the role ARN to use and whether the optimizer is enabled.
const optimizerSettings = {
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    enabled: true,
};

// Declares a "compaction" optimizer for example_table in example_database.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "compaction",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: optimizerSettings,
});
import pulumi
import pulumi_aws as aws
# Optimizer settings: the role ARN to use and whether the optimizer is enabled.
optimizer_configuration = {
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "enabled": True,
}

# Declares a "compaction" optimizer for example_table in example_database.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="compaction",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=optimizer_configuration,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one Glue Catalog table optimizer
// of type "compaction" for example_table in example_database.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Optimizer settings: the role ARN to use and the enabled flag.
		optimizerConfig := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("compaction"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: optimizerConfig,
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Top-level Pulumi program: declares a "compaction" optimizer for
// example_table in example_database.
return await Deployment.RunAsync(() =>
{
    // Optimizer settings: the role ARN to use and the enabled flag.
    var optimizerConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
    };

    var example = new Aws.Glue.CatalogTableOptimizer("example", new()
    {
        Type = "compaction",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = optimizerConfiguration,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a Glue Catalog table optimizer of type
 * "compaction" for example_table in example_database.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the "example" optimizer resource.
     *
     * @param ctx the Pulumi context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        // Optimizer settings: the role ARN to use and the enabled flag.
        final var optimizerConfiguration = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("compaction")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(optimizerConfiguration)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}
# Declares a "compaction" optimizer for example_table in example_database.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      # Optimizer settings: the role ARN to use and the enabled flag.
      configuration:
        roleArn: arn:aws:iam::123456789012:role/example-role
        enabled: true
      # Optimizer type (a resource property, distinct from the resource type above).
      type: compaction
The type property set to “compaction” enables automatic file merging. The configuration block specifies the IAM role that Glue assumes to perform compaction operations. Setting enabled to true activates the optimizer; Glue schedules compaction runs automatically based on table activity.
Clean up old Iceberg table snapshots
Iceberg tables create snapshots for time travel, but these accumulate storage costs. Retention policies delete old snapshots while preserving recent history.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Iceberg snapshot retention settings used by the optimizer below.
const icebergRetention = {
    snapshotRetentionPeriodInDays: 7,
    numberOfSnapshotsToRetain: 3,
    cleanExpiredFiles: true,
};

// Optimizer settings: role ARN, enabled flag, and the retention configuration.
const optimizerSettings = {
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    enabled: true,
    retentionConfiguration: {
        icebergConfiguration: icebergRetention,
    },
};

// Declares a "retention" optimizer for example_table in example_database.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "retention",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: optimizerSettings,
});
import pulumi
import pulumi_aws as aws
# Iceberg snapshot retention settings used by the optimizer below.
iceberg_retention = {
    "snapshot_retention_period_in_days": 7,
    "number_of_snapshots_to_retain": 3,
    "clean_expired_files": True,
}

# Optimizer settings: role ARN, enabled flag, and the retention configuration.
optimizer_configuration = {
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "enabled": True,
    "retention_configuration": {
        "iceberg_configuration": iceberg_retention,
    },
}

# Declares a "retention" optimizer for example_table in example_database.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="retention",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=optimizer_configuration,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one Glue Catalog table optimizer
// of type "retention" for example_table in example_database.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Iceberg snapshot retention settings.
		icebergRetention := &glue.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs{
			SnapshotRetentionPeriodInDays: pulumi.Int(7),
			NumberOfSnapshotsToRetain:     pulumi.Int(3),
			CleanExpiredFiles:             pulumi.Bool(true),
		}

		// Optimizer settings: role ARN, enabled flag, and the retention configuration.
		optimizerConfig := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
			RetentionConfiguration: &glue.CatalogTableOptimizerConfigurationRetentionConfigurationArgs{
				IcebergConfiguration: icebergRetention,
			},
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("retention"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: optimizerConfig,
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Top-level Pulumi program: declares a "retention" optimizer for
// example_table in example_database.
return await Deployment.RunAsync(() =>
{
    // Iceberg snapshot retention settings.
    var icebergRetention = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs
    {
        SnapshotRetentionPeriodInDays = 7,
        NumberOfSnapshotsToRetain = 3,
        CleanExpiredFiles = true,
    };

    // Optimizer settings: role ARN, enabled flag, and the retention configuration.
    var optimizerConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
        RetentionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs
        {
            IcebergConfiguration = icebergRetention,
        },
    };

    var example = new Aws.Glue.CatalogTableOptimizer("example", new()
    {
        Type = "retention",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = optimizerConfiguration,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a Glue Catalog table optimizer of type
 * "retention" for example_table in example_database.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the "example" optimizer resource.
     *
     * @param ctx the Pulumi context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        // Iceberg snapshot retention settings.
        final var icebergRetention = CatalogTableOptimizerConfigurationRetentionConfigurationIcebergConfigurationArgs.builder()
            .snapshotRetentionPeriodInDays(7)
            .numberOfSnapshotsToRetain(3)
            .cleanExpiredFiles(true)
            .build();

        final var retentionConfiguration = CatalogTableOptimizerConfigurationRetentionConfigurationArgs.builder()
            .icebergConfiguration(icebergRetention)
            .build();

        // Optimizer settings: role ARN, enabled flag, and the retention configuration.
        final var optimizerConfiguration = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .retentionConfiguration(retentionConfiguration)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("retention")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(optimizerConfiguration)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}
# Declares a "retention" optimizer for example_table in example_database.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      # Optimizer settings: role ARN, enabled flag, and the retention configuration.
      configuration:
        roleArn: arn:aws:iam::123456789012:role/example-role
        enabled: true
        retentionConfiguration:
          # Iceberg snapshot retention settings.
          icebergConfiguration:
            snapshotRetentionPeriodInDays: 7
            numberOfSnapshotsToRetain: 3
            cleanExpiredFiles: true
      # Optimizer type (a resource property, distinct from the resource type above).
      type: retention
The type property set to “retention” enables snapshot cleanup for Iceberg tables. The retentionConfiguration block controls how many snapshots to keep: snapshotRetentionPeriodInDays sets the time window, numberOfSnapshotsToRetain sets the count limit, and cleanExpiredFiles determines whether to delete data files from expired snapshots. Glue applies whichever limit is reached first.
Remove orphaned files from table storage
Failed writes or incomplete transactions leave files in S3 that table metadata no longer references. These orphaned files consume storage without contributing to queries.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Iceberg orphan-file settings: retention period in days and the S3 location.
const icebergOrphanCleanup = {
    orphanFileRetentionPeriodInDays: 7,
    location: "s3://example-bucket/example_table/",
};

// Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
const optimizerSettings = {
    roleArn: "arn:aws:iam::123456789012:role/example-role",
    enabled: true,
    orphanFileDeletionConfiguration: {
        icebergConfiguration: icebergOrphanCleanup,
    },
};

// Declares an "orphan_file_deletion" optimizer for example_table in example_database.
const example = new aws.glue.CatalogTableOptimizer("example", {
    type: "orphan_file_deletion",
    catalogId: "123456789012",
    databaseName: "example_database",
    tableName: "example_table",
    configuration: optimizerSettings,
});
import pulumi
import pulumi_aws as aws
# Iceberg orphan-file settings: retention period in days and the S3 location.
iceberg_orphan_cleanup = {
    "orphan_file_retention_period_in_days": 7,
    "location": "s3://example-bucket/example_table/",
}

# Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
optimizer_configuration = {
    "role_arn": "arn:aws:iam::123456789012:role/example-role",
    "enabled": True,
    "orphan_file_deletion_configuration": {
        "iceberg_configuration": iceberg_orphan_cleanup,
    },
}

# Declares an "orphan_file_deletion" optimizer for example_table in example_database.
example = aws.glue.CatalogTableOptimizer(
    "example",
    type="orphan_file_deletion",
    catalog_id="123456789012",
    database_name="example_database",
    table_name="example_table",
    configuration=optimizer_configuration,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares one Glue Catalog table optimizer
// of type "orphan_file_deletion" for example_table in example_database.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Iceberg orphan-file settings: retention period in days and the S3 location.
		icebergOrphanCleanup := &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs{
			OrphanFileRetentionPeriodInDays: pulumi.Int(7),
			Location:                        pulumi.String("s3://example-bucket/example_table/"),
		}

		// Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
		optimizerConfig := &glue.CatalogTableOptimizerConfigurationArgs{
			RoleArn: pulumi.String("arn:aws:iam::123456789012:role/example-role"),
			Enabled: pulumi.Bool(true),
			OrphanFileDeletionConfiguration: &glue.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs{
				IcebergConfiguration: icebergOrphanCleanup,
			},
		}

		optimizerArgs := &glue.CatalogTableOptimizerArgs{
			Type:          pulumi.String("orphan_file_deletion"),
			CatalogId:     pulumi.String("123456789012"),
			DatabaseName:  pulumi.String("example_database"),
			TableName:     pulumi.String("example_table"),
			Configuration: optimizerConfig,
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewCatalogTableOptimizer(ctx, "example", optimizerArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Top-level Pulumi program: declares an "orphan_file_deletion" optimizer for
// example_table in example_database.
return await Deployment.RunAsync(() =>
{
    // Iceberg orphan-file settings: retention period in days and the S3 location.
    var icebergOrphanCleanup = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs
    {
        OrphanFileRetentionPeriodInDays = 7,
        Location = "s3://example-bucket/example_table/",
    };

    // Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
    var optimizerConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationArgs
    {
        RoleArn = "arn:aws:iam::123456789012:role/example-role",
        Enabled = true,
        OrphanFileDeletionConfiguration = new Aws.Glue.Inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs
        {
            IcebergConfiguration = icebergOrphanCleanup,
        },
    };

    var example = new Aws.Glue.CatalogTableOptimizer("example", new()
    {
        Type = "orphan_file_deletion",
        CatalogId = "123456789012",
        DatabaseName = "example_database",
        TableName = "example_table",
        Configuration = optimizerConfiguration,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.CatalogTableOptimizer;
import com.pulumi.aws.glue.CatalogTableOptimizerArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs;
import com.pulumi.aws.glue.inputs.CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a Glue Catalog table optimizer of type
 * "orphan_file_deletion" for example_table in example_database.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the "example" optimizer resource.
     *
     * @param ctx the Pulumi context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        // Iceberg orphan-file settings: retention period in days and the S3 location.
        final var icebergOrphanCleanup = CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationIcebergConfigurationArgs.builder()
            .orphanFileRetentionPeriodInDays(7)
            .location("s3://example-bucket/example_table/")
            .build();

        final var orphanFileDeletion = CatalogTableOptimizerConfigurationOrphanFileDeletionConfigurationArgs.builder()
            .icebergConfiguration(icebergOrphanCleanup)
            .build();

        // Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
        final var optimizerConfiguration = CatalogTableOptimizerConfigurationArgs.builder()
            .roleArn("arn:aws:iam::123456789012:role/example-role")
            .enabled(true)
            .orphanFileDeletionConfiguration(orphanFileDeletion)
            .build();

        final var optimizerArgs = CatalogTableOptimizerArgs.builder()
            .type("orphan_file_deletion")
            .catalogId("123456789012")
            .databaseName("example_database")
            .tableName("example_table")
            .configuration(optimizerConfiguration)
            .build();

        var example = new CatalogTableOptimizer("example", optimizerArgs);
    }
}
# Declares an "orphan_file_deletion" optimizer for example_table in example_database.
resources:
  example:
    type: aws:glue:CatalogTableOptimizer
    properties:
      catalogId: '123456789012'
      databaseName: example_database
      tableName: example_table
      # Optimizer settings: role ARN, enabled flag, and the orphan file deletion configuration.
      configuration:
        roleArn: arn:aws:iam::123456789012:role/example-role
        enabled: true
        orphanFileDeletionConfiguration:
          # Iceberg orphan-file settings: retention period in days and the S3 location.
          icebergConfiguration:
            orphanFileRetentionPeriodInDays: 7
            location: s3://example-bucket/example_table/
      # Optimizer type (a resource property, distinct from the resource type above).
      type: orphan_file_deletion
The type property set to “orphan_file_deletion” enables cleanup of unreferenced files. The orphanFileDeletionConfiguration block specifies orphanFileRetentionPeriodInDays (how long to wait before deleting files) and location (the S3 path to scan). Glue identifies files not referenced by table metadata and removes them after the retention period.
Beyond these examples
These snippets focus on specific table optimizer features: compaction for small file optimization, snapshot retention for Iceberg tables, and orphan file cleanup. They’re intentionally minimal rather than complete data lake maintenance solutions.
The examples reference pre-existing infrastructure: a Glue Data Catalog with a database and table, an IAM role with Glue and S3 permissions, and an S3 bucket for table storage (needed for orphan file deletion). They focus on optimizer configuration rather than provisioning the surrounding infrastructure.
To keep things focused, common optimizer patterns are omitted, including:
- Optimizer scheduling and run frequency
- Monitoring optimizer runs and metrics
- Cost estimation for optimizer operations
- Integration with Glue jobs and crawlers
These omissions are intentional: the goal is to illustrate how each optimizer type is wired, not provide drop-in data lake modules. See the Glue Catalog Table Optimizer resource reference for all available configuration options.
Let's configure AWS Glue Catalog Table Optimizer
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Optimizer Types & Configuration
Three optimizer types are available: `compaction` for file compaction, `retention` for snapshot retention management, and `orphan_file_deletion` for cleaning up orphaned files.
Each type requires different configuration:
- Compaction - Basic config with `roleArn` and `enabled`
- Retention - Add `retentionConfiguration` with `icebergConfiguration` containing `snapshotRetentionPeriodInDays`, `numberOfSnapshotsToRetain`, and `cleanExpiredFiles`
- Orphan file deletion - Add `orphanFileDeletionConfiguration` with `icebergConfiguration` containing `orphanFileRetentionPeriodInDays` and `location`
Set the `enabled` boolean in the configuration block to control whether the optimizer runs.
IAM & Permissions
Specify the IAM role with `roleArn` in the configuration block. The role must have permissions appropriate for the optimizer type’s operations.
Resource Management
Use the comma-separated format `catalog_id,database_name,table_name,type`. For example: `123456789012,example_database,example_table,compaction`
Using a different cloud?
Explore analytics guides for other cloud providers: