The aws:glue/classifier:Classifier resource, part of the Pulumi AWS provider, defines custom data format detection rules that Glue crawlers use to infer schema from files. This guide focuses on three capabilities: CSV delimiter and header configuration, grok pattern matching for logs, and JSONPath and XML row tag extraction.
Classifiers work with Glue crawlers to detect file structure and create catalog tables. Each classifier handles one format type; changing types recreates the classifier. The examples are intentionally small. Reference them from your crawler configurations to enable custom format detection.
Parse CSV files with custom delimiters and headers
Data lakes often contain CSV files with varying formats: different delimiters, quoted fields, or custom header rows.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// CSV parsing rules: comma delimiter, single-quote quoting, header marked PRESENT,
// and explicit column names.
const csvRules = {
    allowSingleColumn: false,
    containsHeader: "PRESENT",
    delimiter: ",",
    disableValueTrimming: false,
    headers: ["example1", "example2"],
    quoteSymbol: "'",
};

// Glue classifier named "example" that applies the CSV rules above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    csvClassifier: csvRules,
});
import pulumi
import pulumi_aws as aws
# CSV parsing settings: comma delimiter, single-quote quoting, header marked
# PRESENT, and explicit column names.
csv_settings = {
    "allow_single_column": False,
    "contains_header": "PRESENT",
    "delimiter": ",",
    "disable_value_trimming": False,
    "headers": ["example1", "example2"],
    "quote_symbol": "'",
}

# Glue classifier named "example" that uses the CSV settings above.
example = aws.glue.Classifier(
    "example",
    name="example",
    csv_classifier=csv_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a Glue classifier named "example"
// configured with CSV parsing settings.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// CSV parsing settings: comma delimiter, single-quote quoting,
		// header marked PRESENT, and explicit column names.
		csvRules := &glue.ClassifierCsvClassifierArgs{
			AllowSingleColumn:    pulumi.Bool(false),
			ContainsHeader:       pulumi.String("PRESENT"),
			Delimiter:            pulumi.String(","),
			DisableValueTrimming: pulumi.Bool(false),
			Headers: pulumi.StringArray{
				pulumi.String("example1"),
				pulumi.String("example2"),
			},
			QuoteSymbol: pulumi.String("'"),
		}

		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			CsvClassifier: csvRules,
		})
		// The error from NewClassifier (nil on success) is the program's result.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // CSV parsing settings: comma delimiter, single-quote quoting,
    // header marked PRESENT, and explicit column names.
    var csvSettings = new Aws.Glue.Inputs.ClassifierCsvClassifierArgs
    {
        AllowSingleColumn = false,
        ContainsHeader = "PRESENT",
        Delimiter = ",",
        DisableValueTrimming = false,
        Headers = new[] { "example1", "example2" },
        QuoteSymbol = "'",
    };

    // Glue classifier named "example" that uses the CSV settings above.
    var example = new Aws.Glue.Classifier("example", new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        CsvClassifier = csvSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierCsvClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares a Glue classifier with CSV parsing settings. */
public class App {
    /** Entry point: hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the "example" Glue classifier and its CSV parsing settings. */
    public static void stack(Context ctx) {
        // CSV parsing settings: comma delimiter, single-quote quoting,
        // header marked PRESENT, and explicit column names.
        final var csvSettings = ClassifierCsvClassifierArgs.builder()
            .allowSingleColumn(false)
            .containsHeader("PRESENT")
            .delimiter(",")
            .disableValueTrimming(false)
            .headers("example1", "example2")
            .quoteSymbol("'")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .csvClassifier(csvSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program declaring a Glue classifier with CSV parsing settings.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      csvClassifier:
        allowSingleColumn: false
        containsHeader: PRESENT
        delimiter: ','
        disableValueTrimming: false
        # Explicit column names.
        headers:
          - example1
          - example2
        # YAML single-quoted scalar holding one single-quote character (').
        quoteSymbol: ''''
When a crawler encounters CSV files, the classifier tells Glue how to parse them. The delimiter property sets the field separator; containsHeader indicates whether the first row contains column names. The headers array defines column names when the file lacks a header row, and quoteSymbol specifies the character used to wrap fields containing delimiters.
Parse log files with grok patterns
Application logs and server access logs follow custom formats that require pattern matching to extract structured fields.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Grok settings: the classification label and the pattern used to match log lines.
const grokRules = {
    classification: "example",
    grokPattern: "example",
};

// Glue classifier named "example" that applies the grok settings above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    grokClassifier: grokRules,
});
import pulumi
import pulumi_aws as aws
# Grok settings: the classification label and the pattern used to match log lines.
grok_settings = {
    "classification": "example",
    "grok_pattern": "example",
}

# Glue classifier named "example" that uses the grok settings above.
example = aws.glue.Classifier(
    "example",
    name="example",
    grok_classifier=grok_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a Glue classifier named "example"
// configured with a grok pattern.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Grok settings: classification label plus the matching pattern.
		grokRules := &glue.ClassifierGrokClassifierArgs{
			Classification: pulumi.String("example"),
			GrokPattern:    pulumi.String("example"),
		}

		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			GrokClassifier: grokRules,
		})
		// The error from NewClassifier (nil on success) is the program's result.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Grok settings: classification label plus the matching pattern.
    var grokSettings = new Aws.Glue.Inputs.ClassifierGrokClassifierArgs
    {
        Classification = "example",
        GrokPattern = "example",
    };

    // Glue classifier named "example" that uses the grok settings above.
    var example = new Aws.Glue.Classifier("example", new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        GrokClassifier = grokSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierGrokClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares a Glue classifier with a grok pattern. */
public class App {
    /** Entry point: hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the "example" Glue classifier and its grok settings. */
    public static void stack(Context ctx) {
        // Grok settings: classification label plus the matching pattern.
        final var grokSettings = ClassifierGrokClassifierArgs.builder()
            .classification("example")
            .grokPattern("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .grokClassifier(grokSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program declaring a Glue classifier with a grok pattern.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      grokClassifier:
        classification: example
        grokPattern: example
The grokPattern property defines a regular expression-like pattern that matches log lines and extracts named fields. The classification property labels the detected format for catalog organization. Glue uses these patterns during crawls to transform unstructured log data into queryable tables.
Extract nested JSON structures with JSONPath
JSON files often contain nested objects or arrays where the actual data records live at a specific path rather than at the root level.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// JSON settings: the path that locates the records inside each document.
const jsonRules = {
    jsonPath: "example",
};

// Glue classifier named "example" that applies the JSON settings above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    jsonClassifier: jsonRules,
});
import pulumi
import pulumi_aws as aws
# JSON settings: the path that locates the records inside each document.
json_settings = {
    "json_path": "example",
}

# Glue classifier named "example" that uses the JSON settings above.
example = aws.glue.Classifier(
    "example",
    name="example",
    json_classifier=json_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a Glue classifier named "example"
// configured with a JSON path.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// JSON settings: the path that locates the records in each document.
		jsonRules := &glue.ClassifierJsonClassifierArgs{
			JsonPath: pulumi.String("example"),
		}

		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			JsonClassifier: jsonRules,
		})
		// The error from NewClassifier (nil on success) is the program's result.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // JSON settings: the path that locates the records in each document.
    var jsonSettings = new Aws.Glue.Inputs.ClassifierJsonClassifierArgs
    {
        JsonPath = "example",
    };

    // Glue classifier named "example" that uses the JSON settings above.
    var example = new Aws.Glue.Classifier("example", new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        JsonClassifier = jsonSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierJsonClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares a Glue classifier with a JSON path. */
public class App {
    /** Entry point: hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the "example" Glue classifier and its JSON settings. */
    public static void stack(Context ctx) {
        // JSON settings: the path that locates the records in each document.
        final var jsonSettings = ClassifierJsonClassifierArgs.builder()
            .jsonPath("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .jsonClassifier(jsonSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program declaring a Glue classifier with a JSON path.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      jsonClassifier:
        jsonPath: example
The jsonPath property uses JSONPath syntax to locate the array of records within the document structure. When crawlers process JSON files, they follow this path to identify individual records rather than treating the entire document as a single row.
Parse XML documents by row tag
XML data sources organize records within specific tags that define row boundaries for table-like structures.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// XML settings: the classification label and the tag that marks each row.
const xmlRules = {
    classification: "example",
    rowTag: "example",
};

// Glue classifier named "example" that applies the XML settings above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    xmlClassifier: xmlRules,
});
import pulumi
import pulumi_aws as aws
# XML settings: the classification label and the tag that marks each row.
xml_settings = {
    "classification": "example",
    "row_tag": "example",
}

# Glue classifier named "example" that uses the XML settings above.
example = aws.glue.Classifier(
    "example",
    name="example",
    xml_classifier=xml_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that declares a Glue classifier named "example"
// configured with an XML row tag.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// XML settings: classification label plus the tag that marks each row.
		xmlRules := &glue.ClassifierXmlClassifierArgs{
			Classification: pulumi.String("example"),
			RowTag:         pulumi.String("example"),
		}

		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			XmlClassifier: xmlRules,
		})
		// The error from NewClassifier (nil on success) is the program's result.
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // XML settings: classification label plus the tag that marks each row.
    var xmlSettings = new Aws.Glue.Inputs.ClassifierXmlClassifierArgs
    {
        Classification = "example",
        RowTag = "example",
    };

    // Glue classifier named "example" that uses the XML settings above.
    var example = new Aws.Glue.Classifier("example", new Aws.Glue.ClassifierArgs
    {
        Name = "example",
        XmlClassifier = xmlSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierXmlClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares a Glue classifier with an XML row tag. */
public class App {
    /** Entry point: hands the stack definition to the Pulumi runtime. */
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the "example" Glue classifier and its XML settings. */
    public static void stack(Context ctx) {
        // XML settings: classification label plus the tag that marks each row.
        final var xmlSettings = ClassifierXmlClassifierArgs.builder()
            .classification("example")
            .rowTag("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .xmlClassifier(xmlSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program declaring a Glue classifier with an XML row tag.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      xmlClassifier:
        classification: example
        rowTag: example
The rowTag property identifies which XML element represents an individual record. The classification property labels the format. Crawlers use this information to split XML documents into rows for catalog tables.
Beyond these examples
These snippets focus on specific classifier features: CSV delimiter and header detection, grok pattern matching for logs, and JSONPath and XML row tag extraction. They’re intentionally minimal rather than complete crawler configurations.
The examples demonstrate classifier configuration without showing the crawler resources that reference them. Each classifier handles one format type; you cannot combine multiple types in a single classifier.
To keep things focused, some classifier options are omitted or shown without detailed explanation, including:
- Custom grok patterns (customPatterns)
- Single-column CSV handling (allowSingleColumn)
- Value trimming controls (disableValueTrimming)
These omissions are intentional: the goal is to illustrate how each classifier type is wired, not provide drop-in crawler modules. See the Glue Classifier resource reference for all available configuration options.
Let's configure AWS Glue Classifiers
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Classifier Types & Limitations
Glue classifiers come in four types: CSV (csvClassifier), Grok (grokClassifier), JSON (jsonClassifier), and XML (xmlClassifier). Each type has its own configuration properties.
Resource Management
The name property is immutable. To rename a classifier, you must create a new resource with the desired name and delete the old one.
To import an existing classifier, run pulumi import aws:glue/classifier:Classifier MyClassifier MyClassifier, where the argument is the classifier name.
Using a different cloud?
Explore analytics guides for other cloud providers: