The aws:glue/crawler:Crawler resource, part of the Pulumi AWS provider, defines Glue crawlers that scan data sources, infer schemas, and populate the Glue Data Catalog with table metadata. This guide focuses on three capabilities: targeting S3, DynamoDB, and JDBC data sources; scheduling and configuration; and schema change detection.
Crawlers depend on a Glue catalog database to hold discovered tables, an IAM role with read permissions on the data sources and write permissions to the catalog, and the data sources themselves. The examples are intentionally small; combine them with your own IAM policies, catalog databases, and data source infrastructure.
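The role's trust policy must allow the Glue service to assume it. A minimal sketch of that policy, built as plain JSON; pairing it with the AWS-managed AWSGlueServiceRole policy plus scoped S3 read access is the usual pattern (the exact permissions you attach depend on your data sources):

```python
import json

# Trust policy allowing the Glue service to assume the crawler's role.
# This is only the trust side; attach AWSGlueServiceRole plus read access
# to your specific data sources separately.
assume_role_policy = json.dumps({
    "Version": "2012-10-17",
    "Statement": [{
        "Effect": "Allow",
        "Principal": {"Service": "glue.amazonaws.com"},
        "Action": "sts:AssumeRole",
    }],
})

# AWS-managed policy granting the crawler its catalog permissions.
GLUE_SERVICE_ROLE = "arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole"
```

Feed `assume_role_policy` to the role resource (for example, `aws.iam.Role(..., assume_role_policy=assume_role_policy)`), then attach `GLUE_SERVICE_ROLE` and your S3 read policy to it.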
Crawl S3 buckets to discover table schemas
Data lakes built on S3 need automated schema discovery to keep catalog tables synchronized with evolving data formats. Crawlers scan S3 paths and infer schemas from file contents, making data queryable via Athena and Spark.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const example = new aws.glue.Crawler("example", {
databaseName: exampleAwsGlueCatalogDatabase.name,
name: "example",
role: exampleAwsIamRole.arn,
s3Targets: [{
path: `s3://${exampleAwsS3Bucket.bucket}`,
}],
});
import pulumi
import pulumi_aws as aws
example = aws.glue.Crawler("example",
database_name=example_aws_glue_catalog_database["name"],
name="example",
role=example_aws_iam_role["arn"],
s3_targets=[{
"path": f"s3://{example_aws_s3_bucket['bucket']}",
}])
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
Name: pulumi.String("example"),
Role: pulumi.Any(exampleAwsIamRole.Arn),
S3Targets: glue.CrawlerS3TargetArray{
&glue.CrawlerS3TargetArgs{
Path: pulumi.Sprintf("s3://%v", exampleAwsS3Bucket.Bucket),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
var example = new Aws.Glue.Crawler("example", new()
{
DatabaseName = exampleAwsGlueCatalogDatabase.Name,
Name = "example",
Role = exampleAwsIamRole.Arn,
S3Targets = new[]
{
new Aws.Glue.Inputs.CrawlerS3TargetArgs
{
Path = $"s3://{exampleAwsS3Bucket.Bucket}",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerS3TargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new Crawler("example", CrawlerArgs.builder()
.databaseName(exampleAwsGlueCatalogDatabase.name())
.name("example")
.role(exampleAwsIamRole.arn())
.s3Targets(CrawlerS3TargetArgs.builder()
.path(String.format("s3://%s", exampleAwsS3Bucket.bucket()))
.build())
.build());
}
}
resources:
example:
type: aws:glue:Crawler
properties:
databaseName: ${exampleAwsGlueCatalogDatabase.name}
name: example
role: ${exampleAwsIamRole.arn}
s3Targets:
- path: s3://${exampleAwsS3Bucket.bucket}
The s3Targets property specifies which S3 paths to scan. The crawler reads files at that location, infers column names and types, and writes table definitions to the specified databaseName. The role must have permissions to read S3 and write to the Glue catalog.
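An S3 target can also carry glob exclusion patterns so the crawler skips scratch and sidecar files. A sketch of that target shape in Python (the bucket name and patterns here are illustrative, not from the example above):

```python
# Illustrative S3 target: crawl the events prefix but skip non-data files.
# `exclusions` takes glob patterns evaluated relative to the include path.
bucket_name = "example-data-lake"  # hypothetical bucket
s3_target = {
    "path": f"s3://{bucket_name}/events/",
    "exclusions": [
        "**/_temporary/**",  # Spark scratch output
        "**/*.metadata",     # sidecar files
    ],
}
```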
Crawl DynamoDB tables for schema inference
Applications storing data in DynamoDB can expose that data to analytics tools by cataloging table schemas in Glue. Crawlers read DynamoDB table metadata and create corresponding catalog entries.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const example = new aws.glue.Crawler("example", {
databaseName: exampleAwsGlueCatalogDatabase.name,
name: "example",
role: exampleAwsIamRole.arn,
dynamodbTargets: [{
path: "table-name",
}],
});
import pulumi
import pulumi_aws as aws
example = aws.glue.Crawler("example",
database_name=example_aws_glue_catalog_database["name"],
name="example",
role=example_aws_iam_role["arn"],
dynamodb_targets=[{
"path": "table-name",
}])
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
Name: pulumi.String("example"),
Role: pulumi.Any(exampleAwsIamRole.Arn),
DynamodbTargets: glue.CrawlerDynamodbTargetArray{
&glue.CrawlerDynamodbTargetArgs{
Path: pulumi.String("table-name"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
var example = new Aws.Glue.Crawler("example", new()
{
DatabaseName = exampleAwsGlueCatalogDatabase.Name,
Name = "example",
Role = exampleAwsIamRole.Arn,
DynamodbTargets = new[]
{
new Aws.Glue.Inputs.CrawlerDynamodbTargetArgs
{
Path = "table-name",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerDynamodbTargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new Crawler("example", CrawlerArgs.builder()
.databaseName(exampleAwsGlueCatalogDatabase.name())
.name("example")
.role(exampleAwsIamRole.arn())
.dynamodbTargets(CrawlerDynamodbTargetArgs.builder()
.path("table-name")
.build())
.build());
}
}
resources:
example:
type: aws:glue:Crawler
properties:
databaseName: ${exampleAwsGlueCatalogDatabase.name}
name: example
role: ${exampleAwsIamRole.arn}
dynamodbTargets:
- path: table-name
The dynamodbTargets property specifies the DynamoDB table name to crawl. The crawler extracts the table’s attribute definitions and creates a catalog table in the specified database. This makes DynamoDB data queryable via Athena or EMR without moving the data.
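DynamoDB targets also accept throttling controls so a crawl does not starve production reads. A sketch of the target shape with those knobs (table name and values are illustrative; per the AWS Glue API, scan rate is expressed as a fraction of the table's configured read capacity):

```python
# Illustrative DynamoDB target: sample the table instead of a full scan,
# and cap how much read capacity the crawler may consume.
dynamodb_target = {
    "path": "events-table",  # hypothetical table name
    "scan_all": False,       # sample rather than read every item
    "scan_rate": 0.5,        # fraction of configured read capacity units
}
```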
Crawl relational databases via JDBC connections
Teams migrating from traditional databases to data lakes often need to catalog existing RDS or on-premises database schemas. JDBC crawlers connect to databases and extract table metadata.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const example = new aws.glue.Crawler("example", {
databaseName: exampleAwsGlueCatalogDatabase.name,
name: "example",
role: exampleAwsIamRole.arn,
jdbcTargets: [{
connectionName: exampleAwsGlueConnection.name,
path: "database-name/%",
}],
});
import pulumi
import pulumi_aws as aws
example = aws.glue.Crawler("example",
database_name=example_aws_glue_catalog_database["name"],
name="example",
role=example_aws_iam_role["arn"],
jdbc_targets=[{
"connection_name": example_aws_glue_connection["name"],
"path": "database-name/%",
}])
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
Name: pulumi.String("example"),
Role: pulumi.Any(exampleAwsIamRole.Arn),
JdbcTargets: glue.CrawlerJdbcTargetArray{
&glue.CrawlerJdbcTargetArgs{
ConnectionName: pulumi.Any(exampleAwsGlueConnection.Name),
Path: pulumi.String("database-name/%"),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
var example = new Aws.Glue.Crawler("example", new()
{
DatabaseName = exampleAwsGlueCatalogDatabase.Name,
Name = "example",
Role = exampleAwsIamRole.Arn,
JdbcTargets = new[]
{
new Aws.Glue.Inputs.CrawlerJdbcTargetArgs
{
ConnectionName = exampleAwsGlueConnection.Name,
Path = "database-name/%",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerJdbcTargetArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new Crawler("example", CrawlerArgs.builder()
.databaseName(exampleAwsGlueCatalogDatabase.name())
.name("example")
.role(exampleAwsIamRole.arn())
.jdbcTargets(CrawlerJdbcTargetArgs.builder()
.connectionName(exampleAwsGlueConnection.name())
.path("database-name/%")
.build())
.build());
}
}
resources:
example:
type: aws:glue:Crawler
properties:
databaseName: ${exampleAwsGlueCatalogDatabase.name}
name: example
role: ${exampleAwsIamRole.arn}
jdbcTargets:
- connectionName: ${exampleAwsGlueConnection.name}
path: database-name/%
The jdbcTargets property references a Glue connection that contains JDBC connection details. The path property uses a pattern (database-name/%) to specify which database and tables to crawl. The crawler connects via JDBC, reads table schemas, and writes them to the Glue catalog.
Schedule crawls and configure table grouping
Production data lakes run crawlers on schedules to keep catalogs current as new data arrives. Configuration options control how crawlers group compatible schemas and handle partitions.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const eventsCrawler = new aws.glue.Crawler("events_crawler", {
databaseName: glueDatabase.name,
schedule: "cron(0 1 * * ? *)",
name: `events_crawler_${environmentName}`,
role: glueRole.arn,
tags: tags,
configuration: JSON.stringify({
Grouping: {
TableGroupingPolicy: "CombineCompatibleSchemas",
},
CrawlerOutput: {
Partitions: {
AddOrUpdateBehavior: "InheritFromTable",
},
},
Version: 1,
}),
s3Targets: [{
path: `s3://${dataLakeBucket.bucket}`,
}],
});
import pulumi
import json
import pulumi_aws as aws
events_crawler = aws.glue.Crawler("events_crawler",
database_name=glue_database["name"],
schedule="cron(0 1 * * ? *)",
name=f"events_crawler_{environment_name}",
role=glue_role["arn"],
tags=tags,
configuration=json.dumps({
"Grouping": {
"TableGroupingPolicy": "CombineCompatibleSchemas",
},
"CrawlerOutput": {
"Partitions": {
"AddOrUpdateBehavior": "InheritFromTable",
},
},
"Version": 1,
}),
s3_targets=[{
"path": f"s3://{data_lake_bucket['bucket']}",
}])
package main
import (
"encoding/json"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tmpJSON0, err := json.Marshal(map[string]interface{}{
"Grouping": map[string]interface{}{
"TableGroupingPolicy": "CombineCompatibleSchemas",
},
"CrawlerOutput": map[string]interface{}{
"Partitions": map[string]interface{}{
"AddOrUpdateBehavior": "InheritFromTable",
},
},
"Version": 1,
})
if err != nil {
return err
}
json0 := string(tmpJSON0)
_, err = glue.NewCrawler(ctx, "events_crawler", &glue.CrawlerArgs{
DatabaseName: pulumi.Any(glueDatabase.Name),
Schedule: pulumi.String("cron(0 1 * * ? *)"),
Name: pulumi.Sprintf("events_crawler_%v", environmentName),
Role: pulumi.Any(glueRole.Arn),
Tags: pulumi.Any(tags),
Configuration: pulumi.String(json0),
S3Targets: glue.CrawlerS3TargetArray{
&glue.CrawlerS3TargetArgs{
Path: pulumi.Sprintf("s3://%v", dataLakeBucket.Bucket),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using System.Text.Json;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
var eventsCrawler = new Aws.Glue.Crawler("events_crawler", new()
{
DatabaseName = glueDatabase.Name,
Schedule = "cron(0 1 * * ? *)",
Name = $"events_crawler_{environmentName}",
Role = glueRole.Arn,
Tags = tags,
Configuration = JsonSerializer.Serialize(new Dictionary<string, object?>
{
["Grouping"] = new Dictionary<string, object?>
{
["TableGroupingPolicy"] = "CombineCompatibleSchemas",
},
["CrawlerOutput"] = new Dictionary<string, object?>
{
["Partitions"] = new Dictionary<string, object?>
{
["AddOrUpdateBehavior"] = "InheritFromTable",
},
},
["Version"] = 1,
}),
S3Targets = new[]
{
new Aws.Glue.Inputs.CrawlerS3TargetArgs
{
Path = $"s3://{dataLakeBucket.Bucket}",
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerS3TargetArgs;
import static com.pulumi.codegen.internal.Serialization.*;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var eventsCrawler = new Crawler("eventsCrawler", CrawlerArgs.builder()
.databaseName(glueDatabase.name())
.schedule("cron(0 1 * * ? *)")
.name(String.format("events_crawler_%s", environmentName))
.role(glueRole.arn())
.tags(tags)
.configuration(serializeJson(
jsonObject(
jsonProperty("Grouping", jsonObject(
jsonProperty("TableGroupingPolicy", "CombineCompatibleSchemas")
)),
jsonProperty("CrawlerOutput", jsonObject(
jsonProperty("Partitions", jsonObject(
jsonProperty("AddOrUpdateBehavior", "InheritFromTable")
))
)),
jsonProperty("Version", 1)
)))
.s3Targets(CrawlerS3TargetArgs.builder()
.path(String.format("s3://%s", dataLakeBucket.bucket()))
.build())
.build());
}
}
resources:
eventsCrawler:
type: aws:glue:Crawler
name: events_crawler
properties:
databaseName: ${glueDatabase.name}
schedule: cron(0 1 * * ? *)
name: events_crawler_${environmentName}
role: ${glueRole.arn}
tags: ${tags}
configuration:
fn::toJSON:
Grouping:
TableGroupingPolicy: CombineCompatibleSchemas
CrawlerOutput:
Partitions:
AddOrUpdateBehavior: InheritFromTable
Version: 1
s3Targets:
- path: s3://${dataLakeBucket.bucket}
The schedule property uses cron syntax to run the crawler automatically (here, daily at 1 AM UTC). The configuration property is a JSON string that controls crawler behavior: TableGroupingPolicy combines tables with compatible schemas, and AddOrUpdateBehavior determines how partitions are handled. This extends the basic S3 example with automation and advanced schema management.
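Because the configuration property is a JSON string, building it from a dict and serializing once avoids escaping mistakes. A sketch mirroring the options above:

```python
import json

# Crawler configuration assembled as a dict, then serialized once.
# Version 1.0 is the schema version the Glue configuration JSON expects.
crawler_configuration = json.dumps({
    "Version": 1.0,
    "Grouping": {"TableGroupingPolicy": "CombineCompatibleSchemas"},
    "CrawlerOutput": {
        "Partitions": {"AddOrUpdateBehavior": "InheritFromTable"},
    },
})
```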
Crawl existing catalog tables for schema changes
When catalog tables already exist, crawlers can monitor them for schema evolution rather than scanning raw data sources. This is useful for tracking changes in derived or transformed tables.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
const example = new aws.glue.Crawler("example", {
databaseName: exampleAwsGlueCatalogDatabase.name,
name: "example",
role: exampleAwsIamRole.arn,
catalogTargets: [{
databaseName: exampleAwsGlueCatalogDatabase.name,
tables: [exampleAwsGlueCatalogTable.name],
}],
schemaChangePolicy: {
deleteBehavior: "LOG",
},
configuration: `{
    "Version": 1.0,
    "Grouping": {
        "TableGroupingPolicy": "CombineCompatibleSchemas"
    }
}`,
});
import pulumi
import pulumi_aws as aws
example = aws.glue.Crawler("example",
database_name=example_aws_glue_catalog_database["name"],
name="example",
role=example_aws_iam_role["arn"],
catalog_targets=[{
"database_name": example_aws_glue_catalog_database["name"],
"tables": [example_aws_glue_catalog_table["name"]],
}],
schema_change_policy={
"delete_behavior": "LOG",
},
configuration="""{
    "Version": 1.0,
    "Grouping": {
        "TableGroupingPolicy": "CombineCompatibleSchemas"
    }
}
""")
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := glue.NewCrawler(ctx, "example", &glue.CrawlerArgs{
DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
Name: pulumi.String("example"),
Role: pulumi.Any(exampleAwsIamRole.Arn),
CatalogTargets: glue.CrawlerCatalogTargetArray{
&glue.CrawlerCatalogTargetArgs{
DatabaseName: pulumi.Any(exampleAwsGlueCatalogDatabase.Name),
Tables: pulumi.StringArray{
exampleAwsGlueCatalogTable.Name,
},
},
},
SchemaChangePolicy: &glue.CrawlerSchemaChangePolicyArgs{
DeleteBehavior: pulumi.String("LOG"),
},
Configuration: pulumi.String(`{
    "Version": 1.0,
    "Grouping": {
        "TableGroupingPolicy": "CombineCompatibleSchemas"
    }
}`),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
var example = new Aws.Glue.Crawler("example", new()
{
DatabaseName = exampleAwsGlueCatalogDatabase.Name,
Name = "example",
Role = exampleAwsIamRole.Arn,
CatalogTargets = new[]
{
new Aws.Glue.Inputs.CrawlerCatalogTargetArgs
{
DatabaseName = exampleAwsGlueCatalogDatabase.Name,
Tables = new[]
{
exampleAwsGlueCatalogTable.Name,
},
},
},
SchemaChangePolicy = new Aws.Glue.Inputs.CrawlerSchemaChangePolicyArgs
{
DeleteBehavior = "LOG",
},
Configuration = @"{
    ""Version"": 1.0,
    ""Grouping"": {
        ""TableGroupingPolicy"": ""CombineCompatibleSchemas""
    }
}",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Crawler;
import com.pulumi.aws.glue.CrawlerArgs;
import com.pulumi.aws.glue.inputs.CrawlerCatalogTargetArgs;
import com.pulumi.aws.glue.inputs.CrawlerSchemaChangePolicyArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new Crawler("example", CrawlerArgs.builder()
.databaseName(exampleAwsGlueCatalogDatabase.name())
.name("example")
.role(exampleAwsIamRole.arn())
.catalogTargets(CrawlerCatalogTargetArgs.builder()
.databaseName(exampleAwsGlueCatalogDatabase.name())
.tables(exampleAwsGlueCatalogTable.name())
.build())
.schemaChangePolicy(CrawlerSchemaChangePolicyArgs.builder()
.deleteBehavior("LOG")
.build())
.configuration("""
{
    "Version": 1.0,
    "Grouping": {
        "TableGroupingPolicy": "CombineCompatibleSchemas"
    }
}
""")
.build());
}
}
resources:
example:
type: aws:glue:Crawler
properties:
databaseName: ${exampleAwsGlueCatalogDatabase.name}
name: example
role: ${exampleAwsIamRole.arn}
catalogTargets:
- databaseName: ${exampleAwsGlueCatalogDatabase.name}
tables:
- ${exampleAwsGlueCatalogTable.name}
schemaChangePolicy:
deleteBehavior: LOG
configuration: |
  {
    "Version": 1.0,
    "Grouping": {
      "TableGroupingPolicy": "CombineCompatibleSchemas"
    }
  }
The catalogTargets property specifies existing catalog tables to monitor. The schemaChangePolicy controls what happens when schemas change (here, LOG records changes without modifying tables). The configuration enables schema grouping for tables with compatible structures. This approach detects schema drift in existing catalog tables without re-scanning underlying data.
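schemaChangePolicy accepts a small set of enum values: per the AWS Glue API, deleteBehavior is one of LOG, DELETE_FROM_DATABASE, or DEPRECATE_IN_DATABASE, and updateBehavior is LOG or UPDATE_IN_DATABASE. A defensive sketch that validates a policy dict before handing it to Pulumi (Glue enforces the same enums server-side, so this only fails faster):

```python
# Valid enum values per the AWS Glue SchemaChangePolicy API.
DELETE_BEHAVIORS = {"LOG", "DELETE_FROM_DATABASE", "DEPRECATE_IN_DATABASE"}
UPDATE_BEHAVIORS = {"LOG", "UPDATE_IN_DATABASE"}

def validate_schema_change_policy(policy: dict) -> dict:
    """Reject unknown behavior values before the deployment round-trips to AWS."""
    delete = policy.get("delete_behavior", "DEPRECATE_IN_DATABASE")
    update = policy.get("update_behavior", "UPDATE_IN_DATABASE")
    if delete not in DELETE_BEHAVIORS:
        raise ValueError(f"invalid delete_behavior: {delete}")
    if update not in UPDATE_BEHAVIORS:
        raise ValueError(f"invalid update_behavior: {update}")
    return policy
```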
Beyond these examples
These snippets focus on specific crawler-level features: data source targeting (S3, DynamoDB, JDBC, and catalog tables), scheduling and configuration, and schema change policies. They're intentionally minimal rather than full data catalog solutions.
The examples reference pre-existing infrastructure such as Glue catalog databases, IAM roles with appropriate permissions, and data sources (S3 buckets, DynamoDB tables, JDBC connections). They focus on configuring the crawler rather than provisioning the surrounding infrastructure.
To keep things focused, common crawler patterns are omitted, including:
- Custom classifiers for non-standard formats
- Recrawl policies for incremental discovery
- Lake Formation integration (lakeFormationConfiguration)
- Security configurations for encryption
- Lineage tracking (lineageConfiguration)
- Table prefixes and exclusion patterns
These omissions are intentional: the goal is to illustrate how each crawler feature is wired, not provide drop-in catalog modules. See the Glue Crawler resource reference for all available configuration options.
Frequently Asked Questions
Configuration & Setup
databaseName and name are immutable; changing either property forces replacement of the crawler. The % wildcard in paths like database-name/% tells the crawler to include all tables or collections within that database.
Scheduling & Execution
Set the schedule property with a cron expression; for example, cron(0 1 * * ? *) runs daily at 1:00 AM UTC. Use recrawlPolicy to specify whether the crawler should process only folders added since the last run or recrawl everything.
Advanced Configuration
Set the configuration property to JSON containing TableGroupingPolicy: "CombineCompatibleSchemas" in the Grouping section. Attach custom classifiers via the classifiers property; these always override the default AWS classifiers for a given classification. Control partition updates through the configuration JSON with CrawlerOutput.Partitions.AddOrUpdateBehavior set to values like InheritFromTable.
Using a different cloud?
Explore analytics guides for other cloud providers: