Configure AWS Glue Classifiers

The aws:glue/classifier:Classifier resource, part of the Pulumi AWS provider, defines custom data format detection rules that Glue crawlers use to infer schema from files. This guide focuses on three capabilities: CSV delimiter and header configuration, grok pattern matching for logs, and JSONPath and XML row tag extraction.

Classifiers work with Glue crawlers to detect file structure and create catalog tables. Each classifier handles one format type; changing types recreates the classifier. The examples are intentionally small. Reference the resulting classifiers from your crawler configurations to enable custom format detection.

Parse CSV files with custom delimiters and headers

Data lakes often contain CSV files with varying formats: different delimiters, quoted fields, or custom header rows.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// CSV parsing rules: comma-delimited, single-quoted fields, header row present.
const csvSettings = {
    delimiter: ",",
    quoteSymbol: "'",
    containsHeader: "PRESENT",
    headers: ["example1", "example2"],
    allowSingleColumn: false,
    disableValueTrimming: false,
};

// Glue classifier named "example" that applies the CSV rules.
const example = new aws.glue.Classifier("example", {
    name: "example",
    csvClassifier: csvSettings,
});

import pulumi
import pulumi_aws as aws

# CSV parsing rules: comma-delimited, single-quoted fields, header row present.
csv_settings = aws.glue.ClassifierCsvClassifierArgs(
    delimiter=",",
    quote_symbol="'",
    contains_header="PRESENT",
    headers=["example1", "example2"],
    allow_single_column=False,
    disable_value_trimming=False,
)

# Glue classifier named "example" that applies the CSV rules.
example = aws.glue.Classifier(
    "example",
    name="example",
    csv_classifier=csv_settings,
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a Glue classifier named "example" that carries CSV parsing
// settings (delimiter, quote symbol, header handling).
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		csvSettings := &glue.ClassifierCsvClassifierArgs{
			AllowSingleColumn:    pulumi.Bool(false),
			ContainsHeader:       pulumi.String("PRESENT"),
			Delimiter:            pulumi.String(","),
			DisableValueTrimming: pulumi.Bool(false),
			Headers: pulumi.StringArray{
				pulumi.String("example1"),
				pulumi.String("example2"),
			},
			QuoteSymbol: pulumi.String("'"),
		}

		classifierArgs := &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			CsvClassifier: csvSettings,
		}

		// The resource handle is not needed; only the registration error matters.
		_, err := glue.NewClassifier(ctx, "example", classifierArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // CSV parsing rules: comma-delimited, single-quoted fields, header row present.
    var csvSettings = new Aws.Glue.Inputs.ClassifierCsvClassifierArgs
    {
        Delimiter = ",",
        QuoteSymbol = "'",
        ContainsHeader = "PRESENT",
        Headers = new[]
        {
            "example1",
            "example2",
        },
        AllowSingleColumn = false,
        DisableValueTrimming = false,
    };

    var classifierArgs = new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        CsvClassifier = csvSettings,
    };

    // Glue classifier named "example" that applies the CSV rules.
    var example = new Aws.Glue.Classifier("example", classifierArgs);

});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierCsvClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/** Pulumi program that registers a Glue classifier with CSV parsing settings. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(ctx -> stack(ctx));
    }

    /** Declares the "example" classifier and its CSV configuration. */
    public static void stack(Context ctx) {
        final var csvSettings = ClassifierCsvClassifierArgs.builder()
            .delimiter(",")
            .quoteSymbol("'")
            .containsHeader("PRESENT")
            .headers("example1", "example2")
            .allowSingleColumn(false)
            .disableValueTrimming(false)
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .csvClassifier(csvSettings)
            .build();

        var classifier = new Classifier("example", classifierArgs);
    }
}

# Declares one Glue classifier resource, "example", configured for CSV input.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # CSV-specific settings: delimiter, header handling and quoting.
      csvClassifier:
        allowSingleColumn: false
        containsHeader: PRESENT
        delimiter: ','
        disableValueTrimming: false
        headers:
          - example1
          - example2
        # YAML single-quoted scalar containing one escaped apostrophe (').
        quoteSymbol: ''''

When a crawler encounters CSV files, the classifier tells Glue how to parse them. The delimiter property sets the field separator; containsHeader indicates whether the first row contains column names (this example sets it to PRESENT). The headers array lists the column names explicitly, and quoteSymbol specifies the character used to wrap fields containing delimiters.

Parse log files with grok patterns

Application logs and server access logs follow custom formats that require pattern matching to extract structured fields.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// Grok settings: the classification label and the pattern applied to each log line.
const grokSettings = {
    classification: "example",
    grokPattern: "example",
};

// Glue classifier named "example" that applies the grok settings.
const example = new aws.glue.Classifier("example", {
    name: "example",
    grokClassifier: grokSettings,
});

import pulumi
import pulumi_aws as aws

# Grok settings: the classification label and the pattern applied to each log line.
grok_settings = aws.glue.ClassifierGrokClassifierArgs(
    classification="example",
    grok_pattern="example",
)

# Glue classifier named "example" that applies the grok settings.
example = aws.glue.Classifier(
    "example",
    name="example",
    grok_classifier=grok_settings,
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a Glue classifier named "example" that carries a grok
// pattern and its classification label.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		grokSettings := &glue.ClassifierGrokClassifierArgs{
			Classification: pulumi.String("example"),
			GrokPattern:    pulumi.String("example"),
		}

		classifierArgs := &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			GrokClassifier: grokSettings,
		}

		// The resource handle is not needed; only the registration error matters.
		_, err := glue.NewClassifier(ctx, "example", classifierArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // Grok settings: the classification label and the pattern applied to each log line.
    var grokSettings = new Aws.Glue.Inputs.ClassifierGrokClassifierArgs
    {
        Classification = "example",
        GrokPattern = "example",
    };

    var classifierArgs = new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        GrokClassifier = grokSettings,
    };

    // Glue classifier named "example" that applies the grok settings.
    var example = new Aws.Glue.Classifier("example", classifierArgs);

});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierGrokClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/** Pulumi program that registers a Glue classifier with a grok pattern. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(ctx -> stack(ctx));
    }

    /** Declares the "example" classifier and its grok configuration. */
    public static void stack(Context ctx) {
        final var grokSettings = ClassifierGrokClassifierArgs.builder()
            .classification("example")
            .grokPattern("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .grokClassifier(grokSettings)
            .build();

        var classifier = new Classifier("example", classifierArgs);
    }
}

# Declares one Glue classifier resource, "example", configured with a grok pattern.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # Grok-specific settings: classification label and the pattern itself.
      grokClassifier:
        classification: example
        grokPattern: example

The grokPattern property defines a regular expression-like pattern that matches log lines and extracts named fields; the value "example" above is a placeholder, and a real pattern looks like %{COMBINEDAPACHELOG}. The classification property labels the detected format for catalog organization. Glue uses these patterns during crawls to transform unstructured log data into queryable tables.

Extract nested JSON structures with JSONPath

JSON files often contain nested objects or arrays where the actual data records live at a specific path rather than at the root level.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// JSON settings: the path that locates the records inside each document.
const jsonSettings = {
    jsonPath: "example",
};

// Glue classifier named "example" that applies the JSON settings.
const example = new aws.glue.Classifier("example", {
    name: "example",
    jsonClassifier: jsonSettings,
});

import pulumi
import pulumi_aws as aws

# JSON settings: the path that locates the records inside each document.
json_settings = aws.glue.ClassifierJsonClassifierArgs(
    json_path="example",
)

# Glue classifier named "example" that applies the JSON settings.
example = aws.glue.Classifier(
    "example",
    name="example",
    json_classifier=json_settings,
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a Glue classifier named "example" that carries a JSON path.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		jsonSettings := &glue.ClassifierJsonClassifierArgs{
			JsonPath: pulumi.String("example"),
		}

		classifierArgs := &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			JsonClassifier: jsonSettings,
		}

		// The resource handle is not needed; only the registration error matters.
		_, err := glue.NewClassifier(ctx, "example", classifierArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // JSON settings: the path that locates the records inside each document.
    var jsonSettings = new Aws.Glue.Inputs.ClassifierJsonClassifierArgs
    {
        JsonPath = "example",
    };

    var classifierArgs = new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        JsonClassifier = jsonSettings,
    };

    // Glue classifier named "example" that applies the JSON settings.
    var example = new Aws.Glue.Classifier("example", classifierArgs);

});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierJsonClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/** Pulumi program that registers a Glue classifier with a JSON path. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(ctx -> stack(ctx));
    }

    /** Declares the "example" classifier and its JSON configuration. */
    public static void stack(Context ctx) {
        final var jsonSettings = ClassifierJsonClassifierArgs.builder()
            .jsonPath("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .jsonClassifier(jsonSettings)
            .build();

        var classifier = new Classifier("example", classifierArgs);
    }
}

# Declares one Glue classifier resource, "example", configured with a JSON path.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # JSON-specific settings: the path used to locate records.
      jsonClassifier:
        jsonPath: example

The jsonPath property uses JSONPath syntax to locate the array of records within the document structure; the value "example" above is a placeholder, and a real path looks like $[*] or $.records[*]. When crawlers process JSON files, they follow this path to identify individual records rather than treating the entire document as a single row.

Parse XML documents by row tag

XML data sources organize records within specific tags that define row boundaries for table-like structures.

import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";

// XML settings: the classification label and the tag that delimits each row.
const xmlSettings = {
    classification: "example",
    rowTag: "example",
};

// Glue classifier named "example" that applies the XML settings.
const example = new aws.glue.Classifier("example", {
    name: "example",
    xmlClassifier: xmlSettings,
});

import pulumi
import pulumi_aws as aws

# XML settings: the classification label and the tag that delimits each row.
xml_settings = aws.glue.ClassifierXmlClassifierArgs(
    classification="example",
    row_tag="example",
)

# Glue classifier named "example" that applies the XML settings.
example = aws.glue.Classifier(
    "example",
    name="example",
    xml_classifier=xml_settings,
)

package main

import (
	"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

// main registers a Glue classifier named "example" that carries an XML row
// tag and its classification label.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		xmlSettings := &glue.ClassifierXmlClassifierArgs{
			Classification: pulumi.String("example"),
			RowTag:         pulumi.String("example"),
		}

		classifierArgs := &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			XmlClassifier: xmlSettings,
		}

		// The resource handle is not needed; only the registration error matters.
		_, err := glue.NewClassifier(ctx, "example", classifierArgs)
		return err
	})
}

using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;

return await Deployment.RunAsync(() =>
{
    // XML settings: the classification label and the tag that delimits each row.
    var xmlSettings = new Aws.Glue.Inputs.ClassifierXmlClassifierArgs
    {
        Classification = "example",
        RowTag = "example",
    };

    var classifierArgs = new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        XmlClassifier = xmlSettings,
    };

    // Glue classifier named "example" that applies the XML settings.
    var example = new Aws.Glue.Classifier("example", classifierArgs);

});

package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierXmlClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/** Pulumi program that registers a Glue classifier with an XML row tag. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(ctx -> stack(ctx));
    }

    /** Declares the "example" classifier and its XML configuration. */
    public static void stack(Context ctx) {
        final var xmlSettings = ClassifierXmlClassifierArgs.builder()
            .classification("example")
            .rowTag("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .xmlClassifier(xmlSettings)
            .build();

        var classifier = new Classifier("example", classifierArgs);
    }
}

# Declares one Glue classifier resource, "example", configured with an XML row tag.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # XML-specific settings: classification label and the row-delimiting tag.
      xmlClassifier:
        classification: example
        rowTag: example

The rowTag property identifies which XML element represents an individual record. The classification property labels the format. Crawlers use this information to split XML documents into rows for catalog tables.

Beyond these examples

These snippets focus on specific classifier features: CSV delimiter and header detection, grok pattern matching for logs, and JSONPath and XML row tag extraction. They’re intentionally minimal rather than complete crawler configurations.

The examples demonstrate classifier configuration without showing the crawler resources that reference them. Each classifier handles one format type; you cannot combine multiple types in a single classifier.

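To keep things focused, some classifier options are omitted or shown only with a fixed value, including:

Custom grok patterns (customPatterns)
Single-column CSV handling (allowSingleColumn, set to false in the CSV example)
Value trimming controls (disableValueTrimming, set to false in the CSV example)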

These omissions are intentional: the goal is to illustrate how each classifier type is wired, not provide drop-in crawler modules. See the Glue Classifier resource reference for all available configuration options.

Let's configure AWS Glue Classifiers

Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.

Try Pulumi Cloud for FREE

Frequently Asked Questions

Classifier Types & Limitations

Can I use multiple classifier types in one resource?

No, you can only specify one classifier type per resource (CSV, Grok, JSON, or XML). If you need multiple types, create separate Classifier resources. Changing from one type to another will recreate the classifier.

What classifier types are available?

Four types are supported: CSV (csvClassifier), Grok (grokClassifier), JSON (jsonClassifier), and XML (xmlClassifier). Each type has its own configuration properties.

What happens if I change my classifier type from CSV to JSON?

The classifier will be destroyed and recreated. Changing classifier types triggers resource recreation, so plan accordingly or create a new classifier with a different name.

Resource Management

Can I rename a classifier after creation?

No, the name property is immutable. To rename a classifier, you must create a new resource with the desired name and delete the old one.

How do I import an existing Glue classifier?

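Use pulumi import aws:glue/classifier:Classifier MyClassifier MyClassifier, where the first MyClassifier is the name Pulumi gives the resource in your program and the second is the name of the existing Glue classifier.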

Using a different cloud?

Explore analytics guides for other cloud providers:

Azure Guides GCP Guides