Configure AWS Glue Crawlers

The aws:glue/crawler:Crawler resource, part of the Pulumi AWS provider, defines Glue crawlers that scan data sources, infer schemas, and populate the Glue Data Catalog with table metadata. This guide focuses on three capabilities: targeting S3, DynamoDB, and JDBC data sources; scheduling execution with cron expressions; and configuring table grouping and partition behavior.

Crawlers depend on IAM roles with appropriate permissions, Glue catalog databases to write metadata, and access to the data sources they scan. The examples are intentionally small. Combine them with your own IAM policies, catalog databases, and data infrastructure.
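
For orientation, a minimal TypeScript sketch of those prerequisites might look like the following. It uses the same placeholder names (exampleAwsIamRole, exampleAwsGlueCatalogDatabase) that the snippets below reference, and assumes the AWS-managed AWSGlueServiceRole policy as a baseline; production roles usually need tighter, source-specific permissions.

import * as aws from "@pulumi/aws";

// IAM role the crawler assumes (resource names are illustrative).
const exampleAwsIamRole = new aws.iam.Role("example", {
    assumeRolePolicy: JSON.stringify({
        Version: "2012-10-17",
        Statement: [{
            Effect: "Allow",
            Principal: { Service: "glue.amazonaws.com" },
            Action: "sts:AssumeRole",
        }],
    }),
});

// AWS-managed baseline policy for Glue service roles.
new aws.iam.RolePolicyAttachment("example-glue-service", {
    role: exampleAwsIamRole.name,
    policyArn: "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole",
});

// Catalog database that the crawlers write table metadata into.
const exampleAwsGlueCatalogDatabase = new aws.glue.CatalogDatabase("example", {
    name: "example",
});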

Crawl S3 buckets to discover schemas

Data lakes built on S3 need automated schema discovery to keep Glue Data Catalog tables synchronized with evolving data structures.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    name: "example",
    role: exampleAwsIamRole.arn,
    s3Targets: [{
        path: `s3://${exampleAwsS3Bucket.bucket}`,
    }],
});
import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=example_aws_glue_catalog_database["name"],
    name="example",
    role=example_aws_iam_role["arn"],
    s3_targets=[{
        "path": f"s3://{example_aws_s3_bucket['bucket']}",
    }])
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
			DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
			Name:         pulumi.String("example"),
			Role:         pulumi.Any(exampleAwsIamRole.Arn),
			S3Targets: glue.CrawlerS3TargetArray{
				&glue.CrawlerS3TargetArgs{
					Path: pulumi.Sprintf("s3://%v", exampleAwsS3Bucket.Bucket),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    var example = new Aws.Glue.Crawler("example", new()
    {
        DatabaseName = exampleAwsGlueCatalogDatabase.Name,
        Name = "example",
        Role = exampleAwsIamRole.Arn,
        S3Targets = new[]
        {
            new Aws.Glue.Inputs.CrawlerS3TargetArgs
            {
                Path = $"s3://{exampleAwsS3Bucket.Bucket}",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerS3TargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var example = new Crawler("example", CrawlerArgs.builder()
            .databaseName(exampleAwsGlueCatalogDatabase.name())
            .name("example")
            .role(exampleAwsIamRole.arn())
            .s3Targets(CrawlerS3TargetArgs.builder()
                .path(String.format("s3://%s", exampleAwsS3Bucket.bucket()))
                .build())
            .build());

    }
}
resources:
  example:
    type: aws:glue:Crawler
    properties:
      databaseName: ${exampleAwsGlueCatalogDatabase.name}
      name: example
      role: ${exampleAwsIamRole.arn}
      s3Targets:
        - path: s3://${exampleAwsS3Bucket.bucket}

When the crawler runs, it scans objects in the S3 path, infers schemas from file formats like Parquet or JSON, and creates or updates tables in the specified database. The s3Targets property defines which bucket paths to scan. The role must grant permissions to read S3 objects and write to the Glue catalog.
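
The AWS-managed baseline policy does not cover arbitrary data buckets, so a bucket-scoped read policy is typically attached to the role as well. A minimal sketch, reusing the placeholder role and bucket from the example above:

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Allow the crawler role to list the bucket and read its objects.
new aws.iam.RolePolicy("example-s3-read", {
    role: exampleAwsIamRole.name,
    policy: pulumi.all([exampleAwsS3Bucket.arn]).apply(([bucketArn]) =>
        JSON.stringify({
            Version: "2012-10-17",
            Statement: [{
                Effect: "Allow",
                Action: ["s3:GetObject", "s3:ListBucket"],
                Resource: [bucketArn, `${bucketArn}/*`],
            }],
        })),
});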

Crawl DynamoDB tables for schema inference

Applications storing data in DynamoDB can expose that data to analytics tools by cataloging table schemas in Glue.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    name: "example",
    role: exampleAwsIamRole.arn,
    dynamodbTargets: [{
        path: "table-name",
    }],
});
import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=example_aws_glue_catalog_database["name"],
    name="example",
    role=example_aws_iam_role["arn"],
    dynamodb_targets=[{
        "path": "table-name",
    }])
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
			DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
			Name:         pulumi.String("example"),
			Role:         pulumi.Any(exampleAwsIamRole.Arn),
			DynamodbTargets: glue.CrawlerDynamodbTargetArray{
				&glue.CrawlerDynamodbTargetArgs{
					Path: pulumi.String("table-name"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    var example = new Aws.Glue.Crawler("example", new()
    {
        DatabaseName = exampleAwsGlueCatalogDatabase.Name,
        Name = "example",
        Role = exampleAwsIamRole.Arn,
        DynamodbTargets = new[]
        {
            new Aws.Glue.Inputs.CrawlerDynamodbTargetArgs
            {
                Path = "table-name",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerDynamodbTargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var example = new Crawler("example", CrawlerArgs.builder()
            .databaseName(exampleAwsGlueCatalogDatabase.name())
            .name("example")
            .role(exampleAwsIamRole.arn())
            .dynamodbTargets(CrawlerDynamodbTargetArgs.builder()
                .path("table-name")
                .build())
            .build());

    }
}
resources:
  example:
    type: aws:glue:Crawler
    properties:
      databaseName: ${exampleAwsGlueCatalogDatabase.name}
      name: example
      role: ${exampleAwsIamRole.arn}
      dynamodbTargets:
        - path: table-name

The crawler reads DynamoDB table metadata and item samples to infer the schema, then creates a catalog table that tools like Athena can query. The dynamodbTargets property specifies the table name. The role needs permissions to describe and scan the DynamoDB table.
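
For large tables, the DynamoDB target also accepts scan settings that control how much data the crawl reads. A hedged sketch with illustrative values, assuming the provider's scanAll and scanRate options (they mirror the Glue DynamoDBTarget API):

import * as aws from "@pulumi/aws";

const sampled = new aws.glue.Crawler("sampled", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    role: exampleAwsIamRole.arn,
    dynamodbTargets: [{
        path: "table-name",
        scanAll: false, // sample items instead of scanning the entire table
        scanRate: 0.5,  // portion of the table's read capacity units the crawl may use
    }],
});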

Crawl JDBC databases through Glue connections

Relational databases like RDS or on-premises systems can be cataloged by connecting through JDBC, enabling federated queries across data sources.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const example = new aws.glue.Crawler("example", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    name: "example",
    role: exampleAwsIamRole.arn,
    jdbcTargets: [{
        connectionName: exampleAwsGlueConnection.name,
        path: "database-name/%",
    }],
});
import pulumi
import pulumi_aws as aws

example = aws.glue.Crawler("example",
    database_name=example_aws_glue_catalog_database["name"],
    name="example",
    role=example_aws_iam_role["arn"],
    jdbc_targets=[{
        "connection_name": example_aws_glue_connection["name"],
        "path": "database-name/%",
    }])
package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
			DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
			Name:         pulumi.String("example"),
			Role:         pulumi.Any(exampleAwsIamRole.Arn),
			JdbcTargets: glue.CrawlerJdbcTargetArray{
				&glue.CrawlerJdbcTargetArgs{
					ConnectionName: pulumi.Any(exampleAwsGlueConnection.Name),
					Path:           pulumi.String("database-name/%"),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    var example = new Aws.Glue.Crawler("example", new()
    {
        DatabaseName = exampleAwsGlueCatalogDatabase.Name,
        Name = "example",
        Role = exampleAwsIamRole.Arn,
        JdbcTargets = new[]
        {
            new Aws.Glue.Inputs.CrawlerJdbcTargetArgs
            {
                ConnectionName = exampleAwsGlueConnection.Name,
                Path = "database-name/%",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerJdbcTargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var example = new Crawler("example", CrawlerArgs.builder()
            .databaseName(exampleAwsGlueCatalogDatabase.name())
            .name("example")
            .role(exampleAwsIamRole.arn())
            .jdbcTargets(CrawlerJdbcTargetArgs.builder()
                .connectionName(exampleAwsGlueConnection.name())
                .path("database-name/%")
                .build())
            .build());

    }
}
resources:
  example:
    type: aws:glue:Crawler
    properties:
      databaseName: ${exampleAwsGlueCatalogDatabase.name}
      name: example
      role: ${exampleAwsIamRole.arn}
      jdbcTargets:
        - connectionName: ${exampleAwsGlueConnection.name}
          path: database-name/%

The crawler connects to the database through the specified Glue connection, which holds the JDBC URL and credentials. The path property uses % as a wildcard to crawl all tables in the named database. The connection must be network-accessible from Glue, which typically requires VPC configuration for RDS sources.
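
The Glue connection referenced above is a separate resource. A minimal sketch, with a placeholder JDBC endpoint and a password read from Pulumi config as a secret; the endpoint, username, and config key are assumptions to adapt:

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const config = new pulumi.Config();

const exampleAwsGlueConnection = new aws.glue.Connection("example", {
    name: "example",
    connectionProperties: {
        // Placeholder endpoint and username; supply your own values.
        JDBC_CONNECTION_URL: "jdbc:mysql://<rds-endpoint>:3306/database-name",
        USERNAME: "crawler_user",
        PASSWORD: config.requireSecret("dbPassword"),
    },
});

For databases inside a VPC, such as RDS, the connection also needs physicalConnectionRequirements (subnet, security groups, and availability zone) so Glue can reach the endpoint.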

Schedule crawls with table grouping and partitioning

Production data lakes often need crawlers that run on a schedule and apply intelligent table grouping to reduce catalog fragmentation.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

const eventsCrawler = new aws.glue.Crawler("events_crawler", {
    databaseName: glueDatabase.name,
    schedule: "cron(0 1 * * ? *)",
    name: `events_crawler_${environmentName}`,
    role: glueRole.arn,
    tags: tags,
    configuration: JSON.stringify({
        Grouping: {
            TableGroupingPolicy: "CombineCompatibleSchemas",
        },
        CrawlerOutput: {
            Partitions: {
                AddOrUpdateBehavior: "InheritFromTable",
            },
        },
        Version: 1,
    }),
    s3Targets: [{
        path: `s3://${dataLakeBucket.bucket}`,
    }],
});
import pulumi
import json
import pulumi_aws as aws

events_crawler = aws.glue.Crawler("events_crawler",
    database_name=glue_database["name"],
    schedule="cron(0 1 * * ? *)",
    name=f"events_crawler_{environment_name}",
    role=glue_role["arn"],
    tags=tags,
    configuration=json.dumps({
        "Grouping": {
            "TableGroupingPolicy": "CombineCompatibleSchemas",
        },
        "CrawlerOutput": {
            "Partitions": {
                "AddOrUpdateBehavior": "InheritFromTable",
            },
        },
        "Version": 1,
    }),
    s3_targets=[{
        "path": f"s3://{data_lake_bucket['bucket']}",
    }])
package main

import (
	"encoding/json"

	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		tmpJSON0, err := json.Marshal(map[string]interface{}{
			"Grouping": map[string]interface{}{
				"TableGroupingPolicy": "CombineCompatibleSchemas",
			},
			"CrawlerOutput": map[string]interface{}{
				"Partitions": map[string]interface{}{
					"AddOrUpdateBehavior": "InheritFromTable",
				},
			},
			"Version": 1,
		})
		if err != nil {
			return err
		}
		json0 := string(tmpJSON0)
		_, err = glue.NewCrawler(ctx, "events_crawler", &glue.CrawlerArgs{
			DatabaseName:  pulumi.Any(glueDatabase.Name),
			Schedule:      pulumi.String("cron(0 1 * * ? *)"),
			Name:          pulumi.Sprintf("events_crawler_%v", environmentName),
			Role:          pulumi.Any(glueRole.Arn),
			Tags:          pulumi.Any(tags),
			Configuration: pulumi.String(json0),
			S3Targets: glue.CrawlerS3TargetArray{
				&glue.CrawlerS3TargetArgs{
					Path: pulumi.Sprintf("s3://%v", dataLakeBucket.Bucket),
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() => 
{
    var eventsCrawler = new Aws.Glue.Crawler("events_crawler", new()
    {
        DatabaseName = glueDatabase.Name,
        Schedule = "cron(0 1 * * ? *)",
        Name = $"events_crawler_{environmentName}",
        Role = glueRole.Arn,
        Tags = tags,
        Configuration = JsonSerializer.Serialize(new Dictionary<string, object?>
        {
            ["Grouping"] = new Dictionary<string, object?>
            {
                ["TableGroupingPolicy"] = "CombineCompatibleSchemas",
            },
            ["CrawlerOutput"] = new Dictionary<string, object?>
            {
                ["Partitions"] = new Dictionary<string, object?>
                {
                    ["AddOrUpdateBehavior"] = "InheritFromTable",
                },
            },
            ["Version"] = 1,
        }),
        S3Targets = new[]
        {
            new Aws.Glue.Inputs.CrawlerS3TargetArgs
            {
                Path = $"s3://{dataLakeBucket.Bucket}",
            },
        },
    });

});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerS3TargetArgs;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        var eventsCrawler = new Crawler("eventsCrawler", CrawlerArgs.builder()
            .databaseName(glueDatabase.name())
            .schedule("cron(0 1 * * ? *)")
            .name(String.format("events_crawler_%s", environmentName))
            .role(glueRole.arn())
            .tags(tags)
            .configuration(serializeJson(
                jsonObject(
                    jsonProperty("Grouping", jsonObject(
                        jsonProperty("TableGroupingPolicy", "CombineCompatibleSchemas")
                    )),
                    jsonProperty("CrawlerOutput", jsonObject(
                        jsonProperty("Partitions", jsonObject(
                            jsonProperty("AddOrUpdateBehavior", "InheritFromTable")
                        ))
                    )),
                    jsonProperty("Version", 1)
                )))
            .s3Targets(CrawlerS3TargetArgs.builder()
                .path(String.format("s3://%s", dataLakeBucket.bucket()))
                .build())
            .build());

    }
}
resources:
  eventsCrawler:
    type: aws:glue:Crawler
    name: events_crawler
    properties:
      databaseName: ${glueDatabase.name}
      schedule: cron(0 1 * * ? *)
      name: events_crawler_${environmentName}
      role: ${glueRole.arn}
      tags: ${tags}
      configuration:
        fn::toJSON:
          Grouping:
            TableGroupingPolicy: CombineCompatibleSchemas
          CrawlerOutput:
            Partitions:
              AddOrUpdateBehavior: InheritFromTable
          Version: 1
      s3Targets:
        - path: s3://${dataLakeBucket.bucket}

The schedule property uses cron syntax to run the crawler daily at 1 AM UTC. The configuration JSON controls how the crawler groups compatible schemas into single tables (TableGroupingPolicy) and how it handles partitions (AddOrUpdateBehavior). Setting AddOrUpdateBehavior to “InheritFromTable” preserves existing partition metadata rather than overwriting it on each crawl.

Beyond these examples

These snippets focus on specific crawler features: data source targeting (S3, DynamoDB, and JDBC), scheduling and configuration, and table grouping and partition handling. They're intentionally minimal rather than full data catalog solutions.

The examples rely on pre-existing infrastructure: IAM roles with Glue, S3, DynamoDB, and JDBC permissions; Glue catalog databases; the S3 buckets, DynamoDB tables, or JDBC-accessible databases being crawled; and a Glue connection for the JDBC target. They focus on configuring the crawler rather than provisioning the surrounding infrastructure.

To keep things focused, common crawler patterns are omitted, including:

  • Custom classifiers for non-standard formats
  • Schema change policies (deleteBehavior, updateBehavior)
  • Recrawl policies for incremental discovery
  • Lake Formation integration and lineage tracking
  • Security configurations for encryption
  • Table prefix customization

These omissions are intentional: the goal is to illustrate how each crawler feature is wired, not provide drop-in data catalog modules. See the Glue Crawler resource reference for all available configuration options.

Frequently Asked Questions

Configuration & Setup
What data sources can a Glue Crawler target?
Crawlers support S3 buckets, DynamoDB tables, JDBC databases, MongoDB databases, Glue Catalog tables, Delta Lake, Hudi, and Iceberg. You must specify at least one target type (s3Targets, dynamodbTargets, jdbcTargets, mongodbTargets, catalogTargets, deltaTargets, hudiTargets, or icebergTargets).
What properties can't I change after creating a crawler?
The databaseName and name properties are immutable. Changing either forces resource replacement.
Can I use an IAM role name instead of an ARN for the role property?
Yes, role accepts either a friendly name (including path without leading slash) or an ARN.
What format does the configuration property accept?
A JSON string, not a language-native object. In TypeScript, use JSON.stringify() to convert objects, as shown in the scheduling and table grouping example.
Scheduling & Crawl Behavior
How do I schedule a crawler to run automatically?
Use the schedule property with a cron expression. For example, cron(0 1 * * ? *) runs daily at 1:00 AM UTC.
How do I configure incremental crawls instead of full dataset scans?
Use recrawlPolicy to specify whether to crawl the entire dataset or only folders added since the last run.
How do I control what happens when the crawler detects schema changes?
Configure schemaChangePolicy to define update and deletion behavior when schemas change.
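
A combined sketch of the two settings above, reusing the placeholder names from the earlier examples; the enum values shown are illustrative, not the only valid ones:

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Schema change handling on a full-scan crawler.
const fullScan = new aws.glue.Crawler("full-scan", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    role: exampleAwsIamRole.arn,
    s3Targets: [{
        path: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}`,
    }],
    schemaChangePolicy: {
        updateBehavior: "UPDATE_IN_DATABASE",    // apply detected schema changes to catalog tables
        deleteBehavior: "DEPRECATE_IN_DATABASE", // mark removed tables as deprecated rather than deleting them
    },
});

// Incremental discovery on a separate crawler.
const incremental = new aws.glue.Crawler("incremental", {
    databaseName: exampleAwsGlueCatalogDatabase.name,
    role: exampleAwsIamRole.arn,
    s3Targets: [{
        path: pulumi.interpolate`s3://${exampleAwsS3Bucket.bucket}`,
    }],
    recrawlPolicy: {
        recrawlBehavior: "CRAWL_NEW_FOLDERS_ONLY", // only crawl folders added since the last run
    },
    schemaChangePolicy: {
        updateBehavior: "LOG", // incremental crawls are commonly paired with log-only change handling
        deleteBehavior: "LOG",
    },
});
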
Advanced Configuration
How do I combine compatible schemas into a single table?
Set TableGroupingPolicy to CombineCompatibleSchemas in the configuration JSON, as shown in the scheduling and table grouping example.
Can I override the default AWS classifiers?
Yes, use the classifiers property to specify custom classifiers. These always override default AWS classifiers for a given classification.
What does the tablePrefix property do?
It adds a prefix to catalog tables created by the crawler, helping organize tables in the Glue Data Catalog.
