The aws:glue/classifier:Classifier resource, part of the Pulumi AWS provider, defines how Glue crawlers parse and classify data files during catalog discovery. This guide focuses on three capabilities: CSV delimiter and header detection, grok pattern matching for logs, and JSONPath and XML row tag selection.
Each classifier supports exactly one format type (CSV, grok, JSON, or XML). Changing types recreates the classifier. Glue crawlers reference classifiers to detect schemas automatically. The examples are intentionally small. Combine them with your own crawler configurations.
Parse CSV files with custom delimiters and headers
Data lakes ingest CSV files with varying formats: different delimiters, quote characters, and header conventions.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// CSV parsing rules: field delimiter, header handling, explicit column
// names, and the quote character used around fields.
const csvSettings = {
    allowSingleColumn: false,
    containsHeader: "PRESENT",
    delimiter: ",",
    disableValueTrimming: false,
    headers: ["example1", "example2"],
    quoteSymbol: "'",
};

// Register the classifier with the CSV rules defined above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    csvClassifier: csvSettings,
});
import pulumi
import pulumi_aws as aws
# CSV parsing rules: field delimiter, header handling, explicit column
# names, and the quote character used around fields.
csv_settings = {
    "allow_single_column": False,
    "contains_header": "PRESENT",
    "delimiter": ",",
    "disable_value_trimming": False,
    "headers": ["example1", "example2"],
    "quote_symbol": "'",
}

# Register the classifier with the CSV rules defined above.
example = aws.glue.Classifier(
    "example",
    name="example",
    csv_classifier=csv_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that registers one Glue classifier
// configured for CSV input.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// CSV parsing rules: delimiter, header handling, explicit column
		// names, and the quote character used around fields.
		csvArgs := &glue.ClassifierCsvClassifierArgs{
			AllowSingleColumn:    pulumi.Bool(false),
			ContainsHeader:       pulumi.String("PRESENT"),
			Delimiter:            pulumi.String(","),
			DisableValueTrimming: pulumi.Bool(false),
			Headers: pulumi.StringArray{
				pulumi.String("example1"),
				pulumi.String("example2"),
			},
			QuoteSymbol: pulumi.String("'"),
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			CsvClassifier: csvArgs,
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // CSV parsing rules: field delimiter, header handling, explicit column
    // names, and the quote character used around fields.
    var csvSettings = new Aws.Glue.Inputs.ClassifierCsvClassifierArgs
    {
        AllowSingleColumn = false,
        ContainsHeader = "PRESENT",
        Delimiter = ",",
        DisableValueTrimming = false,
        Headers = new[] { "example1", "example2" },
        QuoteSymbol = "'",
    };

    // Register the classifier with the CSV rules defined above.
    var example = new Aws.Glue.Classifier("example", new()
    {
        Name = "example",
        CsvClassifier = csvSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierCsvClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that registers one Glue classifier configured for CSV input. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the stack's resources. */
    public static void stack(Context ctx) {
        // CSV parsing rules: field delimiter, header handling, explicit
        // column names, and the quote character used around fields.
        final var csvSettings = ClassifierCsvClassifierArgs.builder()
            .allowSingleColumn(false)
            .containsHeader("PRESENT")
            .delimiter(",")
            .disableValueTrimming(false)
            .headers("example1", "example2")
            .quoteSymbol("'")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .csvClassifier(csvSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program: one Glue classifier configured for CSV input.
# Nesting is significant here; each level below must stay indented under
# its parent key.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # CSV parsing rules: delimiter, header handling, explicit column
      # names, and the quote character used around fields.
      csvClassifier:
        allowSingleColumn: false
        containsHeader: PRESENT
        delimiter: ','
        disableValueTrimming: false
        headers:
          - example1
          - example2
        quoteSymbol: ''''
The csvClassifier block defines how Glue interprets CSV structure. The delimiter property sets the field separator; containsHeader tells Glue whether the first row contains column names. The headers array provides explicit column names when the file lacks them. The quoteSymbol property handles quoted fields that contain delimiters.
Parse log files with grok patterns
Application logs and server access logs follow custom formats that require pattern matching to extract structured fields.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Grok rules: `grokPattern` is the extraction pattern and `classification`
// is the format label assigned to matching data.
const grokSettings = {
    classification: "example",
    grokPattern: "example",
};

// Register the classifier with the grok rules defined above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    grokClassifier: grokSettings,
});
import pulumi
import pulumi_aws as aws
# Grok rules: "grok_pattern" is the extraction pattern and "classification"
# is the format label assigned to matching data.
grok_settings = {
    "classification": "example",
    "grok_pattern": "example",
}

# Register the classifier with the grok rules defined above.
example = aws.glue.Classifier(
    "example",
    name="example",
    grok_classifier=grok_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that registers one Glue classifier
// configured with a grok pattern.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Grok rules: the extraction pattern plus the format label
		// assigned to matching data.
		grokArgs := &glue.ClassifierGrokClassifierArgs{
			Classification: pulumi.String("example"),
			GrokPattern:    pulumi.String("example"),
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			GrokClassifier: grokArgs,
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Grok rules: the extraction pattern plus the format label assigned
    // to matching data.
    var grokSettings = new Aws.Glue.Inputs.ClassifierGrokClassifierArgs
    {
        Classification = "example",
        GrokPattern = "example",
    };

    // Register the classifier with the grok rules defined above.
    var example = new Aws.Glue.Classifier("example", new()
    {
        Name = "example",
        GrokClassifier = grokSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierGrokClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that registers one Glue classifier configured with a grok pattern. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the stack's resources. */
    public static void stack(Context ctx) {
        // Grok rules: the extraction pattern plus the format label
        // assigned to matching data.
        final var grokSettings = ClassifierGrokClassifierArgs.builder()
            .classification("example")
            .grokPattern("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .grokClassifier(grokSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program: one Glue classifier configured with a grok pattern.
# Nesting is significant here; each level below must stay indented under
# its parent key.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # Grok rules: the extraction pattern plus the format label assigned
      # to matching data.
      grokClassifier:
        classification: example
        grokPattern: example
The grokClassifier block uses grok patterns to parse unstructured text. The grokPattern property defines the extraction pattern; classification sets the format label that appears in the Data Catalog. Glue applies this pattern to each line during crawls.
Parse JSON documents with JSONPath selectors
JSON files often nest data several levels deep. JSONPath expressions tell Glue where to find the records.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// JSON rules: `jsonPath` points at the records inside the document.
const jsonSettings = {
    jsonPath: "example",
};

// Register the classifier with the JSON rules defined above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    jsonClassifier: jsonSettings,
});
import pulumi
import pulumi_aws as aws
# JSON rules: "json_path" points at the records inside the document.
json_settings = {
    "json_path": "example",
}

# Register the classifier with the JSON rules defined above.
example = aws.glue.Classifier(
    "example",
    name="example",
    json_classifier=json_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that registers one Glue classifier
// configured with a JSONPath selector.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// JSON rules: the path that points at the records in the document.
		jsonArgs := &glue.ClassifierJsonClassifierArgs{
			JsonPath: pulumi.String("example"),
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:           pulumi.String("example"),
			JsonClassifier: jsonArgs,
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // JSON rules: the path that points at the records in the document.
    var jsonSettings = new Aws.Glue.Inputs.ClassifierJsonClassifierArgs
    {
        JsonPath = "example",
    };

    // Register the classifier with the JSON rules defined above.
    var example = new Aws.Glue.Classifier("example", new()
    {
        Name = "example",
        JsonClassifier = jsonSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierJsonClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that registers one Glue classifier configured with a JSONPath selector. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the stack's resources. */
    public static void stack(Context ctx) {
        // JSON rules: the path that points at the records in the document.
        final var jsonSettings = ClassifierJsonClassifierArgs.builder()
            .jsonPath("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .jsonClassifier(jsonSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program: one Glue classifier configured with a JSONPath
# selector. Nesting is significant here; each level below must stay
# indented under its parent key.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # JSON rules: the path that points at the records in the document.
      jsonClassifier:
        jsonPath: example
The jsonClassifier block uses JSONPath to locate records within nested JSON. The jsonPath property specifies the path to the array or object that represents individual records. Glue extracts schema from the structure at that path.
Parse XML documents by row tag
XML data sources organize records under repeating element tags. The row tag identifies which element represents a single record.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// XML rules: `rowTag` names the element that wraps each record and
// `classification` is the format label.
const xmlSettings = {
    classification: "example",
    rowTag: "example",
};

// Register the classifier with the XML rules defined above.
const example = new aws.glue.Classifier("example", {
    name: "example",
    xmlClassifier: xmlSettings,
});
import pulumi
import pulumi_aws as aws
# XML rules: "row_tag" names the element that wraps each record and
# "classification" is the format label.
xml_settings = {
    "classification": "example",
    "row_tag": "example",
}

# Register the classifier with the XML rules defined above.
example = aws.glue.Classifier(
    "example",
    name="example",
    xml_classifier=xml_settings,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/glue"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main runs a Pulumi program that registers one Glue classifier
// configured with an XML row tag.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// XML rules: the element name that wraps each record plus the
		// format label.
		xmlArgs := &glue.ClassifierXmlClassifierArgs{
			Classification: pulumi.String("example"),
			RowTag:         pulumi.String("example"),
		}

		// The resource handle is not needed; only the error is propagated.
		_, err := glue.NewClassifier(ctx, "example", &glue.ClassifierArgs{
			Name:          pulumi.String("example"),
			XmlClassifier: xmlArgs,
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // XML rules: the element name that wraps each record plus the
    // format label.
    var xmlSettings = new Aws.Glue.Inputs.ClassifierXmlClassifierArgs
    {
        Classification = "example",
        RowTag = "example",
    };

    // Register the classifier with the XML rules defined above.
    var example = new Aws.Glue.Classifier("example", new()
    {
        Name = "example",
        XmlClassifier = xmlSettings,
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.glue.Classifier;
import com.pulumi.aws.glue.ClassifierArgs;
import com.pulumi.aws.glue.inputs.ClassifierXmlClassifierArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that registers one Glue classifier configured with an XML row tag. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the stack's resources. */
    public static void stack(Context ctx) {
        // XML rules: the element name that wraps each record plus the
        // format label.
        final var xmlSettings = ClassifierXmlClassifierArgs.builder()
            .classification("example")
            .rowTag("example")
            .build();

        final var classifierArgs = ClassifierArgs.builder()
            .name("example")
            .xmlClassifier(xmlSettings)
            .build();

        var example = new Classifier("example", classifierArgs);
    }
}
# Pulumi YAML program: one Glue classifier configured with an XML row tag.
# Nesting is significant here; each level below must stay indented under
# its parent key.
resources:
  example:
    type: aws:glue:Classifier
    properties:
      name: example
      # XML rules: the element name that wraps each record plus the
      # format label.
      xmlClassifier:
        classification: example
        rowTag: example
The xmlClassifier block defines XML parsing rules. The rowTag property specifies the element name that wraps each record; classification sets the format label. Glue treats each occurrence of this tag as a separate row during schema detection.
Beyond these examples
These snippets focus on specific classifier features: CSV, grok, JSON, and XML parsing. They’re intentionally minimal rather than full crawler configurations.
The examples don’t require pre-existing infrastructure. They focus on classifier configuration; crawler setup is outside their scope.
To keep things focused, some classifier options are omitted or shown without explanation, including:
- Custom grok patterns (customPatterns)
- Single-column CSV handling (allowSingleColumn)
- Value trimming control (disableValueTrimming)
These omissions are intentional: the goal is to illustrate how each classifier type is wired, not provide drop-in data catalog modules. See the Glue Classifier resource reference for all available configuration options.
Let's configure AWS Glue Classifiers
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Classifier Types & Configuration
Glue supports four classifier types: CSV (csvClassifier), grok (grokClassifier), JSON (jsonClassifier), and XML (xmlClassifier). Choose the type that matches your data format.
Each type has different requirements:
- CSV: Optional fields like delimiter, containsHeader, headers, and quoteSymbol
- Grok: Requires classification and grokPattern
- JSON: Requires jsonPath
- XML: Requires classification and rowTag
Resource Lifecycle & Immutability
The name property is immutable. Changing it will force resource recreation.
Using a different cloud?
Explore analytics guides for other cloud providers: