The aws:kendra/dataSource:DataSource resource, part of the Pulumi AWS provider, connects a Kendra index to document repositories such as S3 buckets, websites, databases, or custom ingestion pipelines. This guide focuses on four capabilities: custom data sources with manual sync, S3 connector with scheduling and filtering, web crawler with seed URLs and scope controls, and access control and metadata configuration.
Data sources depend on a Kendra index, IAM roles with read permissions for the source repository, and the repository itself. The examples are intentionally small. Combine them with your own index, IAM policies, and repository configuration.
Create a custom data source with manual sync
Teams building custom ingestion pipelines often use CUSTOM data sources that accept documents through the API rather than connecting to a repository.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares a Kendra data source named "example" of type CUSTOM with language
// code "en" and a hello=world tag; no schedule or repository configuration is set.
// NOTE: exampleAwsKendraIndex is not declared in this snippet; declare it elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
description: "example",
languageCode: "en",
type: "CUSTOM",
tags: {
hello: "world",
},
});
import pulumi
import pulumi_aws as aws
# Declares a Kendra data source named "example" of type CUSTOM with language
# code "en" and a hello=world tag; no schedule or repository configuration is set.
# NOTE: example_aws_kendra_index is not declared in this snippet; declare it elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
description="example",
language_code="en",
type="CUSTOM",
tags={
"hello": "world",
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a Kendra data source named
// "example" of type CUSTOM with language code "en" and a hello=world tag.
// No schedule or repository configuration is set.
// NOTE: exampleAwsKendraIndex is not declared in this snippet; declare it elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Description: pulumi.String("example"),
LanguageCode: pulumi.String("en"),
Type: pulumi.String("CUSTOM"),
Tags: pulumi.StringMap{
"hello": pulumi.String("world"),
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares a Kendra data source named "example" of type CUSTOM with language
// code "en" and a hello=world tag; no schedule or repository configuration is set.
// NOTE: exampleAwsKendraIndex is not declared in this snippet; declare it elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Description = "example",
LanguageCode = "en",
Type = "CUSTOM",
Tags =
{
{ "hello", "world" },
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a Kendra data source named "example" of type
 * CUSTOM with language code "en" and a hello=world tag. No schedule or
 * repository configuration is set.
 *
 * <p>NOTE: exampleAwsKendraIndex is not declared in this snippet; declare it
 * elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.description("example")
.languageCode("en")
.type("CUSTOM")
.tags(Map.of("hello", "world"))
.build());
}
}
# Declares a Kendra data source named "example" of type CUSTOM with language
# code "en" and a hello=world tag; no schedule or repository configuration is set.
# NOTE: exampleAwsKendraIndex is not declared in this snippet; declare it elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
description: example
languageCode: en
type: CUSTOM
tags:
hello: world
The type property set to CUSTOM indicates programmatic document ingestion. Without a schedule property, you control sync timing by calling the StartDataSourceSyncJob API. The languageCode property determines text analysis behavior for all documents.
Index S3 documents on a schedule
Document repositories in S3 need periodic synchronization to keep the search index current as files are added or updated.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares an S3-type Kendra data source that indexes the bucket referenced by
// exampleAwsS3Bucket, using the IAM role referenced by exampleAwsIamRole.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "S3",
roleArn: exampleAwsIamRole.arn,
// Cron expression that triggers automatic syncs.
schedule: "cron(9 10 1 * ? *)",
configuration: {
s3Configuration: {
bucketName: exampleAwsS3Bucket.id,
},
},
});
import pulumi
import pulumi_aws as aws
# Declares an S3-type Kendra data source that indexes the bucket referenced by
# example_aws_s3_bucket, using the IAM role referenced by example_aws_iam_role.
# The schedule argument is a cron expression that triggers automatic syncs.
# NOTE: example_aws_kendra_index, example_aws_iam_role, and example_aws_s3_bucket
# are not declared in this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="S3",
role_arn=example_aws_iam_role["arn"],
schedule="cron(9 10 1 * ? *)",
configuration={
"s3_configuration": {
"bucket_name": example_aws_s3_bucket["id"],
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares an S3-type Kendra data source
// indexing the bucket referenced by exampleAwsS3Bucket, using the IAM role
// referenced by exampleAwsIamRole.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("S3"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
// Cron expression that triggers automatic syncs.
Schedule: pulumi.String("cron(9 10 1 * ? *)"),
Configuration: &kendra.DataSourceConfigurationArgs{
S3Configuration: &kendra.DataSourceConfigurationS3ConfigurationArgs{
BucketName: pulumi.Any(exampleAwsS3Bucket.Id),
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares an S3-type Kendra data source that indexes the bucket referenced by
// exampleAwsS3Bucket, using the IAM role referenced by exampleAwsIamRole.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "S3",
RoleArn = exampleAwsIamRole.Arn,
// Cron expression that triggers automatic syncs.
Schedule = "cron(9 10 1 * ? *)",
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
S3Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationS3ConfigurationArgs
{
BucketName = exampleAwsS3Bucket.Id,
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationS3ConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares an S3-type Kendra data source indexing the
 * bucket referenced by exampleAwsS3Bucket, using the IAM role referenced by
 * exampleAwsIamRole, with a cron expression that triggers automatic syncs.
 *
 * <p>NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket
 * are not declared in this snippet; declare them elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.type("S3")
.roleArn(exampleAwsIamRole.arn())
.schedule("cron(9 10 1 * ? *)")
.configuration(DataSourceConfigurationArgs.builder()
.s3Configuration(DataSourceConfigurationS3ConfigurationArgs.builder()
.bucketName(exampleAwsS3Bucket.id())
.build())
.build())
.build());
}
}
# Declares an S3-type Kendra data source that indexes the bucket referenced by
# exampleAwsS3Bucket, using the IAM role referenced by exampleAwsIamRole.
# NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
# not declared in this snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: S3
roleArn: ${exampleAwsIamRole.arn}
# Cron expression that triggers automatic syncs.
schedule: cron(9 10 1 * ? *)
configuration:
s3Configuration:
bucketName: ${exampleAwsS3Bucket.id}
The schedule property uses cron syntax to trigger automatic syncs. The s3Configuration block specifies the bucket to index. The roleArn grants Kendra permission to read S3 objects and write to CloudWatch Logs.
Apply document-level access control from S3
Enterprise search often requires document-level permissions that restrict which users can see specific results based on access control lists.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares an S3-type Kendra data source whose access control list files are
// read from the "path-1" key path of the indexed bucket.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
    indexId: exampleAwsKendraIndex.id,
    name: "example",
    type: "S3",
    roleArn: exampleAwsIamRole.arn,
    configuration: {
        s3Configuration: {
            bucketName: exampleAwsS3Bucket.id,
            accessControlListConfiguration: {
                // pulumi.interpolate waits for the bucket ID to resolve before
                // building the string. A plain template literal would embed the
                // Output wrapper's string form instead of the actual bucket ID.
                keyPath: pulumi.interpolate`s3://${exampleAwsS3Bucket.id}/path-1`,
            },
        },
    },
});
import pulumi
import pulumi_aws as aws
# Declares an S3-type Kendra data source whose access control list files are
# read from the "path-1" key path of the indexed bucket.
# NOTE: example_aws_kendra_index, example_aws_iam_role, and example_aws_s3_bucket
# are not declared in this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
    index_id=example_aws_kendra_index["id"],
    name="example",
    type="S3",
    role_arn=example_aws_iam_role["arn"],
    configuration={
        "s3_configuration": {
            "bucket_name": example_aws_s3_bucket["id"],
            "access_control_list_configuration": {
                # Output.concat waits for the bucket ID to resolve before joining
                # the pieces. An f-string would embed the Output wrapper's string
                # form instead of the actual bucket ID.
                "key_path": pulumi.Output.concat("s3://", example_aws_s3_bucket["id"], "/path-1"),
            },
        },
    })
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares an S3-type Kendra data source with
// an access control list configuration pointing at the "path-1" key path of
// the indexed bucket.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("S3"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
S3Configuration: &kendra.DataSourceConfigurationS3ConfigurationArgs{
BucketName: pulumi.Any(exampleAwsS3Bucket.Id),
AccessControlListConfiguration: &kendra.DataSourceConfigurationS3ConfigurationAccessControlListConfigurationArgs{
// pulumi.Sprintf builds the key path from the bucket ID value.
KeyPath: pulumi.Sprintf("s3://%v/path-1", exampleAwsS3Bucket.Id),
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares an S3-type Kendra data source whose access control list files are
// read from the "path-1" key path of the indexed bucket.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
    var example = new Aws.Kendra.DataSource("example", new()
    {
        IndexId = exampleAwsKendraIndex.Id,
        Name = "example",
        Type = "S3",
        RoleArn = exampleAwsIamRole.Arn,
        Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
        {
            S3Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationS3ConfigurationArgs
            {
                BucketName = exampleAwsS3Bucket.Id,
                AccessControlListConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationS3ConfigurationAccessControlListConfigurationArgs
                {
                    // Output.Format waits for the bucket ID to resolve before
                    // formatting. A bare interpolated string would embed the
                    // Output wrapper's string form instead of the actual bucket ID.
                    KeyPath = Output.Format($"s3://{exampleAwsS3Bucket.Id}/path-1"),
                },
            },
        },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationS3ConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationS3ConfigurationAccessControlListConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares an S3-type Kendra data source whose access
 * control list files are read from the "path-1" key path of the indexed bucket.
 *
 * <p>NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket
 * are not declared in this snippet; declare them elsewhere in the program.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Registers the data source with the Pulumi stack.
     *
     * @param ctx the Pulumi program context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        var example = new DataSource("example", DataSourceArgs.builder()
            .indexId(exampleAwsKendraIndex.id())
            .name("example")
            .type("S3")
            .roleArn(exampleAwsIamRole.arn())
            .configuration(DataSourceConfigurationArgs.builder()
                .s3Configuration(DataSourceConfigurationS3ConfigurationArgs.builder()
                    .bucketName(exampleAwsS3Bucket.id())
                    .accessControlListConfiguration(DataSourceConfigurationS3ConfigurationAccessControlListConfigurationArgs.builder()
                        // Output.format waits for the bucket ID to resolve before
                        // formatting. String.format would embed the Output wrapper's
                        // toString() instead of the actual bucket ID.
                        .keyPath(Output.format("s3://%s/path-1", exampleAwsS3Bucket.id()))
                        .build())
                    .build())
                .build())
            .build());
    }
}
# Declares an S3-type Kendra data source with an access control list
# configuration pointing at the "path-1" key path of the indexed bucket.
# NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
# not declared in this snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: S3
roleArn: ${exampleAwsIamRole.arn}
configuration:
s3Configuration:
bucketName: ${exampleAwsS3Bucket.id}
accessControlListConfiguration:
keyPath: s3://${exampleAwsS3Bucket.id}/path-1
The accessControlListConfiguration property points to JSON files in S3 that define per-document access rules. Kendra reads these ACL files during indexing and filters search results based on the requesting user’s identity. The keyPath must reference a valid S3 location containing ACL files in Kendra’s expected format.
Filter S3 documents with patterns and metadata
Large S3 buckets often contain mixed content where only specific files or directories should be indexed.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares an S3-type Kendra data source that sets exclusion patterns,
// inclusion patterns, inclusion prefixes, and a documents-metadata S3 prefix
// on the bucket configuration.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "S3",
roleArn: exampleAwsIamRole.arn,
configuration: {
s3Configuration: {
bucketName: exampleAwsS3Bucket.id,
exclusionPatterns: ["example"],
inclusionPatterns: ["hello"],
inclusionPrefixes: ["world"],
documentsMetadataConfiguration: {
s3Prefix: "example",
},
},
},
});
import pulumi
import pulumi_aws as aws
# Declares an S3-type Kendra data source that sets exclusion patterns,
# inclusion patterns, inclusion prefixes, and a documents-metadata S3 prefix
# on the bucket configuration.
# NOTE: example_aws_kendra_index, example_aws_iam_role, and example_aws_s3_bucket
# are not declared in this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="S3",
role_arn=example_aws_iam_role["arn"],
configuration={
"s3_configuration": {
"bucket_name": example_aws_s3_bucket["id"],
"exclusion_patterns": ["example"],
"inclusion_patterns": ["hello"],
"inclusion_prefixes": ["world"],
"documents_metadata_configuration": {
"s3_prefix": "example",
},
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares an S3-type Kendra data source with
// exclusion patterns, inclusion patterns, inclusion prefixes, and a
// documents-metadata S3 prefix set on the bucket configuration.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("S3"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
S3Configuration: &kendra.DataSourceConfigurationS3ConfigurationArgs{
BucketName: pulumi.Any(exampleAwsS3Bucket.Id),
ExclusionPatterns: pulumi.StringArray{
pulumi.String("example"),
},
InclusionPatterns: pulumi.StringArray{
pulumi.String("hello"),
},
InclusionPrefixes: pulumi.StringArray{
pulumi.String("world"),
},
DocumentsMetadataConfiguration: &kendra.DataSourceConfigurationS3ConfigurationDocumentsMetadataConfigurationArgs{
S3Prefix: pulumi.String("example"),
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares an S3-type Kendra data source that sets exclusion patterns,
// inclusion patterns, inclusion prefixes, and a documents-metadata S3 prefix
// on the bucket configuration.
// NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
// not declared in this snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "S3",
RoleArn = exampleAwsIamRole.Arn,
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
S3Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationS3ConfigurationArgs
{
BucketName = exampleAwsS3Bucket.Id,
ExclusionPatterns = new[]
{
"example",
},
InclusionPatterns = new[]
{
"hello",
},
InclusionPrefixes = new[]
{
"world",
},
DocumentsMetadataConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationS3ConfigurationDocumentsMetadataConfigurationArgs
{
S3Prefix = "example",
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationS3ConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationS3ConfigurationDocumentsMetadataConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares an S3-type Kendra data source with exclusion
 * patterns, inclusion patterns, inclusion prefixes, and a documents-metadata
 * S3 prefix set on the bucket configuration.
 *
 * <p>NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket
 * are not declared in this snippet; declare them elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.type("S3")
.roleArn(exampleAwsIamRole.arn())
.configuration(DataSourceConfigurationArgs.builder()
.s3Configuration(DataSourceConfigurationS3ConfigurationArgs.builder()
.bucketName(exampleAwsS3Bucket.id())
.exclusionPatterns("example")
.inclusionPatterns("hello")
.inclusionPrefixes("world")
.documentsMetadataConfiguration(DataSourceConfigurationS3ConfigurationDocumentsMetadataConfigurationArgs.builder()
.s3Prefix("example")
.build())
.build())
.build())
.build());
}
}
# Declares an S3-type Kendra data source that sets exclusion patterns,
# inclusion patterns, inclusion prefixes, and a documents-metadata S3 prefix
# on the bucket configuration.
# NOTE: exampleAwsKendraIndex, exampleAwsIamRole, and exampleAwsS3Bucket are
# not declared in this snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: S3
roleArn: ${exampleAwsIamRole.arn}
configuration:
s3Configuration:
bucketName: ${exampleAwsS3Bucket.id}
exclusionPatterns:
- example
inclusionPatterns:
- hello
inclusionPrefixes:
- world
documentsMetadataConfiguration:
s3Prefix: example
The inclusionPatterns and exclusionPatterns properties filter which files to index based on glob patterns. The inclusionPrefixes property limits indexing to specific S3 prefixes. The documentsMetadataConfiguration block specifies where to find metadata files that enrich indexed documents with custom attributes.
Crawl websites starting from seed URLs
Public documentation sites and knowledge bases can be indexed by providing starting URLs that Kendra crawls to discover linked pages.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares a WEBCRAWLER-type Kendra data source that starts from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "WEBCRAWLER",
roleArn: exampleAwsIamRole.arn,
configuration: {
webCrawlerConfiguration: {
urls: {
seedUrlConfiguration: {
// Placeholder: replace with the URL the crawl should start from.
seedUrls: ["REPLACE_WITH_YOUR_URL"],
},
},
},
},
});
import pulumi
import pulumi_aws as aws
# Declares a WEBCRAWLER-type Kendra data source that starts from a single seed URL.
# REPLACE_WITH_YOUR_URL is a placeholder: replace it with the URL the crawl should start from.
# NOTE: example_aws_kendra_index and example_aws_iam_role are not declared in
# this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="WEBCRAWLER",
role_arn=example_aws_iam_role["arn"],
configuration={
"web_crawler_configuration": {
"urls": {
"seed_url_configuration": {
"seed_urls": ["REPLACE_WITH_YOUR_URL"],
},
},
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a WEBCRAWLER-type Kendra data
// source starting from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("WEBCRAWLER"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
WebCrawlerConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationArgs{
Urls: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs{
SeedUrlConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs{
SeedUrls: pulumi.StringArray{
// Placeholder: replace with the URL the crawl should start from.
pulumi.String("REPLACE_WITH_YOUR_URL"),
},
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares a WEBCRAWLER-type Kendra data source that starts from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "WEBCRAWLER",
RoleArn = exampleAwsIamRole.Arn,
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
WebCrawlerConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationArgs
{
Urls = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs
{
SeedUrlConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs
{
SeedUrls = new[]
{
// Placeholder: replace with the URL the crawl should start from.
"REPLACE_WITH_YOUR_URL",
},
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a WEBCRAWLER-type Kendra data source starting
 * from a single seed URL. REPLACE_WITH_YOUR_URL is a placeholder and must be
 * replaced with the URL the crawl should start from.
 *
 * <p>NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in
 * this snippet; declare them elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.type("WEBCRAWLER")
.roleArn(exampleAwsIamRole.arn())
.configuration(DataSourceConfigurationArgs.builder()
.webCrawlerConfiguration(DataSourceConfigurationWebCrawlerConfigurationArgs.builder()
.urls(DataSourceConfigurationWebCrawlerConfigurationUrlsArgs.builder()
.seedUrlConfiguration(DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs.builder()
.seedUrls("REPLACE_WITH_YOUR_URL")
.build())
.build())
.build())
.build())
.build());
}
}
# Declares a WEBCRAWLER-type Kendra data source that starts from a single seed URL.
# NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
# snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: WEBCRAWLER
roleArn: ${exampleAwsIamRole.arn}
configuration:
webCrawlerConfiguration:
urls:
seedUrlConfiguration:
# Placeholder: replace with the URL the crawl should start from.
seedUrls:
- REPLACE_WITH_YOUR_URL
The webCrawlerConfiguration block defines crawling behavior. The seedUrls array provides starting points; Kendra follows links from these pages to discover additional content. You must replace the placeholder URL with your actual target.
Control crawl scope with subdomain mode
Documentation sites often span multiple subdomains that should be indexed together.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares a WEBCRAWLER-type Kendra data source whose seed URL configuration
// sets the crawler mode to SUBDOMAINS.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "WEBCRAWLER",
roleArn: exampleAwsIamRole.arn,
configuration: {
webCrawlerConfiguration: {
urls: {
seedUrlConfiguration: {
webCrawlerMode: "SUBDOMAINS",
// Placeholder: replace with the URL the crawl should start from.
seedUrls: ["REPLACE_WITH_YOUR_URL"],
},
},
},
},
});
import pulumi
import pulumi_aws as aws
# Declares a WEBCRAWLER-type Kendra data source whose seed URL configuration
# sets the crawler mode to SUBDOMAINS.
# REPLACE_WITH_YOUR_URL is a placeholder: replace it with the URL the crawl should start from.
# NOTE: example_aws_kendra_index and example_aws_iam_role are not declared in
# this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="WEBCRAWLER",
role_arn=example_aws_iam_role["arn"],
configuration={
"web_crawler_configuration": {
"urls": {
"seed_url_configuration": {
"web_crawler_mode": "SUBDOMAINS",
"seed_urls": ["REPLACE_WITH_YOUR_URL"],
},
},
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a WEBCRAWLER-type Kendra data
// source whose seed URL configuration sets the crawler mode to SUBDOMAINS.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("WEBCRAWLER"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
WebCrawlerConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationArgs{
Urls: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs{
SeedUrlConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs{
WebCrawlerMode: pulumi.String("SUBDOMAINS"),
SeedUrls: pulumi.StringArray{
// Placeholder: replace with the URL the crawl should start from.
pulumi.String("REPLACE_WITH_YOUR_URL"),
},
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares a WEBCRAWLER-type Kendra data source whose seed URL configuration
// sets the crawler mode to SUBDOMAINS.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "WEBCRAWLER",
RoleArn = exampleAwsIamRole.Arn,
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
WebCrawlerConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationArgs
{
Urls = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs
{
SeedUrlConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs
{
WebCrawlerMode = "SUBDOMAINS",
SeedUrls = new[]
{
// Placeholder: replace with the URL the crawl should start from.
"REPLACE_WITH_YOUR_URL",
},
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a WEBCRAWLER-type Kendra data source whose
 * seed URL configuration sets the crawler mode to SUBDOMAINS.
 * REPLACE_WITH_YOUR_URL is a placeholder and must be replaced with the URL
 * the crawl should start from.
 *
 * <p>NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in
 * this snippet; declare them elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.type("WEBCRAWLER")
.roleArn(exampleAwsIamRole.arn())
.configuration(DataSourceConfigurationArgs.builder()
.webCrawlerConfiguration(DataSourceConfigurationWebCrawlerConfigurationArgs.builder()
.urls(DataSourceConfigurationWebCrawlerConfigurationUrlsArgs.builder()
.seedUrlConfiguration(DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs.builder()
.webCrawlerMode("SUBDOMAINS")
.seedUrls("REPLACE_WITH_YOUR_URL")
.build())
.build())
.build())
.build())
.build());
}
}
# Declares a WEBCRAWLER-type Kendra data source whose seed URL configuration
# sets the crawler mode to SUBDOMAINS.
# NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
# snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: WEBCRAWLER
roleArn: ${exampleAwsIamRole.arn}
configuration:
webCrawlerConfiguration:
urls:
seedUrlConfiguration:
webCrawlerMode: SUBDOMAINS
# Placeholder: replace with the URL the crawl should start from.
seedUrls:
- REPLACE_WITH_YOUR_URL
The webCrawlerMode property set to SUBDOMAINS allows the crawler to follow links to any subdomain of the seed URL’s domain. Without this setting, the crawler stays within the exact host specified in the seed URL.
Limit crawl depth to control scope
Deep website hierarchies can lead to excessive crawling; depth limits prevent the crawler from following links beyond a specified distance from seed URLs.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares a WEBCRAWLER-type Kendra data source with a crawl depth of 3,
// starting from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "WEBCRAWLER",
roleArn: exampleAwsIamRole.arn,
configuration: {
webCrawlerConfiguration: {
crawlDepth: 3,
urls: {
seedUrlConfiguration: {
// Placeholder: replace with the URL the crawl should start from.
seedUrls: ["REPLACE_WITH_YOUR_URL"],
},
},
},
},
});
import pulumi
import pulumi_aws as aws
# Declares a WEBCRAWLER-type Kendra data source with a crawl depth of 3,
# starting from a single seed URL.
# REPLACE_WITH_YOUR_URL is a placeholder: replace it with the URL the crawl should start from.
# NOTE: example_aws_kendra_index and example_aws_iam_role are not declared in
# this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="WEBCRAWLER",
role_arn=example_aws_iam_role["arn"],
configuration={
"web_crawler_configuration": {
"crawl_depth": 3,
"urls": {
"seed_url_configuration": {
"seed_urls": ["REPLACE_WITH_YOUR_URL"],
},
},
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a WEBCRAWLER-type Kendra data
// source with a crawl depth of 3, starting from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("WEBCRAWLER"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
WebCrawlerConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationArgs{
CrawlDepth: pulumi.Int(3),
Urls: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs{
SeedUrlConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs{
SeedUrls: pulumi.StringArray{
// Placeholder: replace with the URL the crawl should start from.
pulumi.String("REPLACE_WITH_YOUR_URL"),
},
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares a WEBCRAWLER-type Kendra data source with a crawl depth of 3,
// starting from a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "WEBCRAWLER",
RoleArn = exampleAwsIamRole.Arn,
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
WebCrawlerConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationArgs
{
CrawlDepth = 3,
Urls = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs
{
SeedUrlConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs
{
SeedUrls = new[]
{
// Placeholder: replace with the URL the crawl should start from.
"REPLACE_WITH_YOUR_URL",
},
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a WEBCRAWLER-type Kendra data source with a
 * crawl depth of 3, starting from a single seed URL. REPLACE_WITH_YOUR_URL is
 * a placeholder and must be replaced with the URL the crawl should start from.
 *
 * <p>NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in
 * this snippet; declare them elsewhere in the program.
 */
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new DataSource("example", DataSourceArgs.builder()
.indexId(exampleAwsKendraIndex.id())
.name("example")
.type("WEBCRAWLER")
.roleArn(exampleAwsIamRole.arn())
.configuration(DataSourceConfigurationArgs.builder()
.webCrawlerConfiguration(DataSourceConfigurationWebCrawlerConfigurationArgs.builder()
.crawlDepth(3)
.urls(DataSourceConfigurationWebCrawlerConfigurationUrlsArgs.builder()
.seedUrlConfiguration(DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs.builder()
.seedUrls("REPLACE_WITH_YOUR_URL")
.build())
.build())
.build())
.build())
.build());
}
}
# Declares a WEBCRAWLER-type Kendra data source with a crawl depth of 3,
# starting from a single seed URL.
# NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
# snippet; declare them elsewhere in the program.
resources:
example:
type: aws:kendra:DataSource
properties:
indexId: ${exampleAwsKendraIndex.id}
name: example
type: WEBCRAWLER
roleArn: ${exampleAwsIamRole.arn}
configuration:
webCrawlerConfiguration:
crawlDepth: 3
urls:
seedUrlConfiguration:
# Placeholder: replace with the URL the crawl should start from.
seedUrls:
- REPLACE_WITH_YOUR_URL
The crawlDepth property limits how many link hops the crawler follows from seed URLs. A depth of 3 means the crawler indexes the seed page, pages linked from the seed, pages linked from those pages, and one more level.
Filter crawled pages with URL patterns
Websites often contain sections that shouldn’t be indexed, such as admin pages or archived content.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Declares a WEBCRAWLER-type Kendra data source that sets URL exclusion and
// inclusion patterns alongside a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
const example = new aws.kendra.DataSource("example", {
indexId: exampleAwsKendraIndex.id,
name: "example",
type: "WEBCRAWLER",
roleArn: exampleAwsIamRole.arn,
configuration: {
webCrawlerConfiguration: {
urlExclusionPatterns: ["example"],
urlInclusionPatterns: ["hello"],
urls: {
seedUrlConfiguration: {
// Placeholder: replace with the URL the crawl should start from.
seedUrls: ["REPLACE_WITH_YOUR_URL"],
},
},
},
},
});
import pulumi
import pulumi_aws as aws
# Declares a WEBCRAWLER-type Kendra data source that sets URL exclusion and
# inclusion patterns alongside a single seed URL.
# REPLACE_WITH_YOUR_URL is a placeholder: replace it with the URL the crawl should start from.
# NOTE: example_aws_kendra_index and example_aws_iam_role are not declared in
# this snippet; declare them elsewhere in the program.
example = aws.kendra.DataSource("example",
index_id=example_aws_kendra_index["id"],
name="example",
type="WEBCRAWLER",
role_arn=example_aws_iam_role["arn"],
configuration={
"web_crawler_configuration": {
"url_exclusion_patterns": ["example"],
"url_inclusion_patterns": ["hello"],
"urls": {
"seed_url_configuration": {
"seed_urls": ["REPLACE_WITH_YOUR_URL"],
},
},
},
})
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/kendra"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a WEBCRAWLER-type Kendra data
// source with URL exclusion and inclusion patterns alongside a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := kendra.NewDataSource(ctx, "example", &kendra.DataSourceArgs{
IndexId: pulumi.Any(exampleAwsKendraIndex.Id),
Name: pulumi.String("example"),
Type: pulumi.String("WEBCRAWLER"),
RoleArn: pulumi.Any(exampleAwsIamRole.Arn),
Configuration: &kendra.DataSourceConfigurationArgs{
WebCrawlerConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationArgs{
UrlExclusionPatterns: pulumi.StringArray{
pulumi.String("example"),
},
UrlInclusionPatterns: pulumi.StringArray{
pulumi.String("hello"),
},
Urls: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs{
SeedUrlConfiguration: &kendra.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs{
SeedUrls: pulumi.StringArray{
// Placeholder: replace with the URL the crawl should start from.
pulumi.String("REPLACE_WITH_YOUR_URL"),
},
},
},
},
},
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares a WEBCRAWLER-type Kendra data source that sets URL exclusion and
// inclusion patterns alongside a single seed URL.
// NOTE: exampleAwsKendraIndex and exampleAwsIamRole are not declared in this
// snippet; declare them elsewhere in the program.
return await Deployment.RunAsync(() =>
{
var example = new Aws.Kendra.DataSource("example", new()
{
IndexId = exampleAwsKendraIndex.Id,
Name = "example",
Type = "WEBCRAWLER",
RoleArn = exampleAwsIamRole.Arn,
Configuration = new Aws.Kendra.Inputs.DataSourceConfigurationArgs
{
WebCrawlerConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationArgs
{
UrlExclusionPatterns = new[]
{
"example",
},
UrlInclusionPatterns = new[]
{
"hello",
},
Urls = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs
{
SeedUrlConfiguration = new Aws.Kendra.Inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs
{
SeedUrls = new[]
{
// Placeholder: replace with the URL the crawl should start from.
"REPLACE_WITH_YOUR_URL",
},
},
},
},
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.kendra.DataSource;
import com.pulumi.aws.kendra.DataSourceArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsArgs;
import com.pulumi.aws.kendra.inputs.DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that declares a single Kendra data source of type WEBCRAWLER.
 *
 * <p>The index and IAM role ({@code exampleAwsKendraIndex}, {@code exampleAwsIamRole})
 * are referenced from outside this snippet.
 */
public class App {
    /**
     * Program entry point; hands {@link #stack(Context)} to {@code Pulumi.run}.
     *
     * @param args command-line arguments (not read)
     */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the data source, assembling the nested configuration from the
     * innermost object outwards.
     *
     * @param ctx Pulumi context; not read by this method
     */
    public static void stack(Context ctx) {
        // Placeholder seed URL; substitute a real one before deploying.
        var seedUrlConfiguration = DataSourceConfigurationWebCrawlerConfigurationUrlsSeedUrlConfigurationArgs.builder()
            .seedUrls("REPLACE_WITH_YOUR_URL")
            .build();

        var urls = DataSourceConfigurationWebCrawlerConfigurationUrlsArgs.builder()
            .seedUrlConfiguration(seedUrlConfiguration)
            .build();

        // URL inclusion/exclusion patterns plus the seed URL configuration.
        var webCrawlerConfiguration = DataSourceConfigurationWebCrawlerConfigurationArgs.builder()
            .urlExclusionPatterns("example")
            .urlInclusionPatterns("hello")
            .urls(urls)
            .build();

        var configuration = DataSourceConfigurationArgs.builder()
            .webCrawlerConfiguration(webCrawlerConfiguration)
            .build();

        var example = new DataSource("example", DataSourceArgs.builder()
            .indexId(exampleAwsKendraIndex.id())
            .name("example")
            .type("WEBCRAWLER")
            .roleArn(exampleAwsIamRole.arn())
            .configuration(configuration)
            .build());
    }
}
# Kendra data source of type WEBCRAWLER. The index and IAM role are referenced
# from outside this snippet; the seed URL is a placeholder to replace.
resources:
  example:
    type: aws:kendra:DataSource
    properties:
      indexId: ${exampleAwsKendraIndex.id}
      name: example
      type: WEBCRAWLER
      roleArn: ${exampleAwsIamRole.arn}
      configuration:
        webCrawlerConfiguration:
          urlExclusionPatterns: [example]
          urlInclusionPatterns: [hello]
          urls:
            seedUrlConfiguration:
              seedUrls: [REPLACE_WITH_YOUR_URL]
The urlInclusionPatterns and urlExclusionPatterns properties filter discovered URLs before crawling. Patterns use regular expressions; exclusion patterns take precedence over inclusion patterns when both match.
Beyond these examples
These snippets focus on specific data source features: S3 and web crawler connectors, access control and metadata filtering, and crawl scope and rate controls. They’re intentionally minimal rather than full search implementations.
The examples reference pre-existing infrastructure such as Kendra indexes, IAM roles with data source permissions, S3 buckets for the S3 connector, and Secrets Manager secrets for authenticated crawling. They focus on configuring the data source rather than provisioning the surrounding infrastructure.
To keep things focused, common data source patterns are omitted, including:
- Custom document enrichment (customDocumentEnrichmentConfiguration)
- Site map crawling (siteMapsConfiguration)
- Proxy and authentication for web crawlers
- Template-based connectors (WEBCRAWLERV2)
- Database and SaaS connectors (Salesforce, ServiceNow, SharePoint)
- Sync job management and error handling
These omissions are intentional: the goal is to illustrate how each data source feature is wired, not to provide drop-in search modules. See the Kendra DataSource resource reference for all available configuration options.
Let's configure AWS Kendra Data Sources
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Data Source Types & Configuration
CUSTOM data sources do not accept the configuration or roleArn parameters. These parameters are only for connector types like S3, WEBCRAWLER, or TEMPLATE.
Data source types include CUSTOM, S3, WEBCRAWLER, and TEMPLATE. For the complete list, see the AWS documentation on valid values for Type.
roleArn is required for all data source types except CUSTOM. The IAM role must have permissions to access your data source repository.
Syncing & Updates
Set schedule with a cron expression for automatic periodic syncing. Without a schedule, Kendra won’t automatically update the index—you’ll need to manually call the StartDataSourceSyncJob API.
Use a cron expression such as cron(9 10 1 * ? *), shown in the schedule example.
Immutability & Lifecycle
indexId and type are immutable. Changing either requires recreating the data source.
Authentication & Dependencies
Include dependsOn for the secret version resource to ensure proper creation order.
Configure proxyConfiguration with credentials (Secrets Manager ARN), host, and port. Include dependsOn for the secret version.
Troubleshooting
Check the status output property. When status is FAILED, the errorMessage field contains the reason for failure.
ACTIVE status means the data source is ready to use and can sync documents to the index.
Using a different cloud?
Explore analytics guides for other cloud providers: