The gcp:dataplex/datascan:Datascan resource, part of the Pulumi GCP provider, defines Dataplex datascans that profile, validate, discover, or document data sources. This guide focuses on four capabilities: data profiling with sampling and field filtering, quality validation with column-level rules, schema discovery with BigQuery publishing, and table documentation generation.
Datascans reference BigQuery tables or Cloud Storage buckets that must exist, and may require BigQuery connections for BigLake publishing. The examples are intentionally small. Combine them with your own data sources, IAM roles, and notification targets.
Profile BigQuery tables with on-demand execution
Data teams often start by profiling BigQuery tables to understand column distributions, null rates, and data types.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const basicProfile = new gcp.dataplex.Datascan("basic_profile", {
location: "us-central1",
dataScanId: "dataprofile-basic",
data: {
resource: "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataProfileSpec: {},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
basic_profile = gcp.dataplex.Datascan("basic_profile",
location="us-central1",
data_scan_id="dataprofile-basic",
data={
"resource": "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
execution_spec={
"trigger": {
"on_demand": {},
},
},
data_profile_spec={},
project="my-project-name")
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataplex.NewDatascan(ctx, "basic_profile", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DataScanId: pulumi.String("dataprofile-basic"),
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.String("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare"),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
OnDemand: &dataplex.DatascanExecutionSpecTriggerOnDemandArgs{},
},
},
DataProfileSpec: &dataplex.DatascanDataProfileSpecArgs{},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var basicProfile = new Gcp.DataPlex.Datascan("basic_profile", new()
{
Location = "us-central1",
DataScanId = "dataprofile-basic",
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
OnDemand = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerOnDemandArgs(),
},
},
DataProfileSpec = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecArgs(),
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerOnDemandArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var basicProfile = new Datascan("basicProfile", DatascanArgs.builder()
.location("us-central1")
.dataScanId("dataprofile-basic")
.data(DatascanDataArgs.builder()
.resource("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare")
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.onDemand(DatascanExecutionSpecTriggerOnDemandArgs.builder()
.build())
.build())
.build())
.dataProfileSpec(DatascanDataProfileSpecArgs.builder()
.build())
.project("my-project-name")
.build());
}
}
resources:
basicProfile:
type: gcp:dataplex:Datascan
name: basic_profile
properties:
location: us-central1
dataScanId: dataprofile-basic
data:
resource: //bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare
executionSpec:
trigger:
onDemand: {}
dataProfileSpec: {}
project: my-project-name
An empty dataProfileSpec enables profiling mode with its default settings. The data.resource property points to a BigQuery table using the full resource URL format. The executionSpec.trigger.onDemand property means the scan runs only when triggered manually, rather than on a schedule.
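Because the trigger is on-demand, nothing runs until you start a job yourself. A minimal sketch, reusing the TypeScript program above: export the scan's generated name so an operator or CI job can trigger runs, for example with the gcloud dataplex datascans run command or the Dataplex API.

// Sketch: export the fully qualified scan name (projects/.../locations/.../dataScans/...)
// so runs can be triggered outside of Pulumi.
export const scanName = basicProfile.name;
export const scanId = basicProfile.dataScanId;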
Schedule profiling with sampling and field filtering
Production profiling workflows often need to control which columns are analyzed and how much data is sampled, then export results to BigQuery.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const source = new gcp.bigquery.Dataset("source", {
datasetId: "dataplex_dataset",
friendlyName: "test",
description: "This is a test description",
location: "US",
deleteContentsOnDestroy: true,
});
const fullProfile = new gcp.dataplex.Datascan("full_profile", {
location: "us-central1",
displayName: "Full Datascan Profile",
dataScanId: "dataprofile-full",
description: "Example resource - Full Datascan Profile",
labels: {
author: "billing",
},
data: {
resource: "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
executionSpec: {
trigger: {
schedule: {
cron: "TZ=America/New_York 1 1 * * *",
},
},
},
dataProfileSpec: {
samplingPercent: 80,
rowFilter: "word_count > 10",
includeFields: {
fieldNames: ["word_count"],
},
excludeFields: {
fieldNames: ["property_type"],
},
postScanActions: {
bigqueryExport: {
resultsTable: "//bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export",
},
},
catalogPublishingEnabled: true,
},
project: "my-project-name",
}, {
dependsOn: [source],
});
import pulumi
import pulumi_gcp as gcp
source = gcp.bigquery.Dataset("source",
dataset_id="dataplex_dataset",
friendly_name="test",
description="This is a test description",
location="US",
delete_contents_on_destroy=True)
full_profile = gcp.dataplex.Datascan("full_profile",
location="us-central1",
display_name="Full Datascan Profile",
data_scan_id="dataprofile-full",
description="Example resource - Full Datascan Profile",
labels={
"author": "billing",
},
data={
"resource": "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
execution_spec={
"trigger": {
"schedule": {
"cron": "TZ=America/New_York 1 1 * * *",
},
},
},
data_profile_spec={
"sampling_percent": 80,
"row_filter": "word_count > 10",
"include_fields": {
"field_names": ["word_count"],
},
"exclude_fields": {
"field_names": ["property_type"],
},
"post_scan_actions": {
"bigquery_export": {
"results_table": "//bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export",
},
},
"catalog_publishing_enabled": True,
},
project="my-project-name",
opts = pulumi.ResourceOptions(depends_on=[source]))
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/bigquery"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
source, err := bigquery.NewDataset(ctx, "source", &bigquery.DatasetArgs{
DatasetId: pulumi.String("dataplex_dataset"),
FriendlyName: pulumi.String("test"),
Description: pulumi.String("This is a test description"),
Location: pulumi.String("US"),
DeleteContentsOnDestroy: pulumi.Bool(true),
})
if err != nil {
return err
}
_, err = dataplex.NewDatascan(ctx, "full_profile", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DisplayName: pulumi.String("Full Datascan Profile"),
DataScanId: pulumi.String("dataprofile-full"),
Description: pulumi.String("Example resource - Full Datascan Profile"),
Labels: pulumi.StringMap{
"author": pulumi.String("billing"),
},
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.String("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare"),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
Schedule: &dataplex.DatascanExecutionSpecTriggerScheduleArgs{
Cron: pulumi.String("TZ=America/New_York 1 1 * * *"),
},
},
},
DataProfileSpec: &dataplex.DatascanDataProfileSpecArgs{
SamplingPercent: pulumi.Float64(80),
RowFilter: pulumi.String("word_count > 10"),
IncludeFields: &dataplex.DatascanDataProfileSpecIncludeFieldsArgs{
FieldNames: pulumi.StringArray{
pulumi.String("word_count"),
},
},
ExcludeFields: &dataplex.DatascanDataProfileSpecExcludeFieldsArgs{
FieldNames: pulumi.StringArray{
pulumi.String("property_type"),
},
},
PostScanActions: &dataplex.DatascanDataProfileSpecPostScanActionsArgs{
BigqueryExport: &dataplex.DatascanDataProfileSpecPostScanActionsBigqueryExportArgs{
ResultsTable: pulumi.String("//bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export"),
},
},
CatalogPublishingEnabled: pulumi.Bool(true),
},
Project: pulumi.String("my-project-name"),
}, pulumi.DependsOn([]pulumi.Resource{
source,
}))
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var source = new Gcp.BigQuery.Dataset("source", new()
{
DatasetId = "dataplex_dataset",
FriendlyName = "test",
Description = "This is a test description",
Location = "US",
DeleteContentsOnDestroy = true,
});
var fullProfile = new Gcp.DataPlex.Datascan("full_profile", new()
{
Location = "us-central1",
DisplayName = "Full Datascan Profile",
DataScanId = "dataprofile-full",
Description = "Example resource - Full Datascan Profile",
Labels =
{
{ "author", "billing" },
},
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
Schedule = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerScheduleArgs
{
Cron = "TZ=America/New_York 1 1 * * *",
},
},
},
DataProfileSpec = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecArgs
{
SamplingPercent = 80,
RowFilter = "word_count > 10",
IncludeFields = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecIncludeFieldsArgs
{
FieldNames = new[]
{
"word_count",
},
},
ExcludeFields = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecExcludeFieldsArgs
{
FieldNames = new[]
{
"property_type",
},
},
PostScanActions = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecPostScanActionsArgs
{
BigqueryExport = new Gcp.DataPlex.Inputs.DatascanDataProfileSpecPostScanActionsBigqueryExportArgs
{
ResultsTable = "//bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export",
},
},
CatalogPublishingEnabled = true,
},
Project = "my-project-name",
}, new CustomResourceOptions
{
DependsOn =
{
source,
},
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.bigquery.Dataset;
import com.pulumi.gcp.bigquery.DatasetArgs;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerScheduleArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecIncludeFieldsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecExcludeFieldsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecPostScanActionsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataProfileSpecPostScanActionsBigqueryExportArgs;
import com.pulumi.resources.CustomResourceOptions;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var source = new Dataset("source", DatasetArgs.builder()
.datasetId("dataplex_dataset")
.friendlyName("test")
.description("This is a test description")
.location("US")
.deleteContentsOnDestroy(true)
.build());
var fullProfile = new Datascan("fullProfile", DatascanArgs.builder()
.location("us-central1")
.displayName("Full Datascan Profile")
.dataScanId("dataprofile-full")
.description("Example resource - Full Datascan Profile")
.labels(Map.of("author", "billing"))
.data(DatascanDataArgs.builder()
.resource("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare")
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.schedule(DatascanExecutionSpecTriggerScheduleArgs.builder()
.cron("TZ=America/New_York 1 1 * * *")
.build())
.build())
.build())
.dataProfileSpec(DatascanDataProfileSpecArgs.builder()
.samplingPercent(80.0)
.rowFilter("word_count > 10")
.includeFields(DatascanDataProfileSpecIncludeFieldsArgs.builder()
.fieldNames("word_count")
.build())
.excludeFields(DatascanDataProfileSpecExcludeFieldsArgs.builder()
.fieldNames("property_type")
.build())
.postScanActions(DatascanDataProfileSpecPostScanActionsArgs.builder()
.bigqueryExport(DatascanDataProfileSpecPostScanActionsBigqueryExportArgs.builder()
.resultsTable("//bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export")
.build())
.build())
.catalogPublishingEnabled(true)
.build())
.project("my-project-name")
.build(), CustomResourceOptions.builder()
.dependsOn(source)
.build());
}
}
resources:
fullProfile:
type: gcp:dataplex:Datascan
name: full_profile
properties:
location: us-central1
displayName: Full Datascan Profile
dataScanId: dataprofile-full
description: Example resource - Full Datascan Profile
labels:
author: billing
data:
resource: //bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare
executionSpec:
trigger:
schedule:
cron: TZ=America/New_York 1 1 * * *
dataProfileSpec:
samplingPercent: 80
rowFilter: word_count > 10
includeFields:
fieldNames:
- word_count
excludeFields:
fieldNames:
- property_type
postScanActions:
bigqueryExport:
resultsTable: //bigquery.googleapis.com/projects/my-project-name/datasets/dataplex_dataset/tables/profile_export
catalogPublishingEnabled: true
project: my-project-name
options:
dependsOn:
- ${source}
source:
type: gcp:bigquery:Dataset
properties:
datasetId: dataplex_dataset
friendlyName: test
description: This is a test description
location: US
deleteContentsOnDestroy: true
The executionSpec.trigger.schedule property runs scans on a cron schedule; the TZ= prefix pins the schedule to a time zone. The samplingPercent property limits analysis to a subset of rows, and rowFilter restricts it to rows matching a SQL predicate. The includeFields and excludeFields properties control which columns are profiled. The postScanActions.bigqueryExport property writes results to a BigQuery table for downstream analysis, and catalogPublishingEnabled publishes profile results to the Dataplex catalog.
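One refinement worth considering: the resultsTable URL above hardcodes the project and dataset even though the stack already manages both. A small TypeScript sketch that derives the URL from the source dataset's outputs instead, so the export target tracks the dataset automatically; the profile_export table name is an arbitrary choice:

// Sketch: build the results table URL from the managed dataset's outputs.
const resultsTable = pulumi.interpolate`//bigquery.googleapis.com/projects/${source.project}/datasets/${source.datasetId}/tables/profile_export`;
// Then reference it in the scan spec:
// postScanActions: { bigqueryExport: { resultsTable } },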
Validate table-level conditions with quality rules
Data quality checks ensure tables meet expectations before downstream processing.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const basicQuality = new gcp.dataplex.Datascan("basic_quality", {
location: "us-central1",
dataScanId: "dataquality-basic",
data: {
resource: "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataQualitySpec: {
rules: [{
dimension: "VALIDITY",
name: "rule1",
description: "rule 1 for validity dimension",
tableConditionExpectation: {
sqlExpression: "COUNT(*) > 0",
},
}],
},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
basic_quality = gcp.dataplex.Datascan("basic_quality",
location="us-central1",
data_scan_id="dataquality-basic",
data={
"resource": "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
execution_spec={
"trigger": {
"on_demand": {},
},
},
data_quality_spec={
"rules": [{
"dimension": "VALIDITY",
"name": "rule1",
"description": "rule 1 for validity dimension",
"table_condition_expectation": {
"sql_expression": "COUNT(*) > 0",
},
}],
},
project="my-project-name")
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataplex.NewDatascan(ctx, "basic_quality", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DataScanId: pulumi.String("dataquality-basic"),
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.String("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare"),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
OnDemand: &dataplex.DatascanExecutionSpecTriggerOnDemandArgs{},
},
},
DataQualitySpec: &dataplex.DatascanDataQualitySpecArgs{
Rules: dataplex.DatascanDataQualitySpecRuleArray{
&dataplex.DatascanDataQualitySpecRuleArgs{
Dimension: pulumi.String("VALIDITY"),
Name: pulumi.String("rule1"),
Description: pulumi.String("rule 1 for validity dimension"),
TableConditionExpectation: &dataplex.DatascanDataQualitySpecRuleTableConditionExpectationArgs{
SqlExpression: pulumi.String("COUNT(*) > 0"),
},
},
},
},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var basicQuality = new Gcp.DataPlex.Datascan("basic_quality", new()
{
Location = "us-central1",
DataScanId = "dataquality-basic",
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare",
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
OnDemand = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerOnDemandArgs(),
},
},
DataQualitySpec = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecArgs
{
Rules = new[]
{
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Dimension = "VALIDITY",
Name = "rule1",
Description = "rule 1 for validity dimension",
TableConditionExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleTableConditionExpectationArgs
{
SqlExpression = "COUNT(*) > 0",
},
},
},
},
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerOnDemandArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleTableConditionExpectationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var basicQuality = new Datascan("basicQuality", DatascanArgs.builder()
.location("us-central1")
.dataScanId("dataquality-basic")
.data(DatascanDataArgs.builder()
.resource("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare")
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.onDemand(DatascanExecutionSpecTriggerOnDemandArgs.builder()
.build())
.build())
.build())
.dataQualitySpec(DatascanDataQualitySpecArgs.builder()
.rules(DatascanDataQualitySpecRuleArgs.builder()
.dimension("VALIDITY")
.name("rule1")
.description("rule 1 for validity dimension")
.tableConditionExpectation(DatascanDataQualitySpecRuleTableConditionExpectationArgs.builder()
.sqlExpression("COUNT(*) > 0")
.build())
.build())
.build())
.project("my-project-name")
.build());
}
}
resources:
basicQuality:
type: gcp:dataplex:Datascan
name: basic_quality
properties:
location: us-central1
dataScanId: dataquality-basic
data:
resource: //bigquery.googleapis.com/projects/bigquery-public-data/datasets/samples/tables/shakespeare
executionSpec:
trigger:
onDemand: {}
dataQualitySpec:
rules:
- dimension: VALIDITY
name: rule1
description: rule 1 for validity dimension
tableConditionExpectation:
sqlExpression: COUNT(*) > 0
project: my-project-name
The dataQualitySpec enables quality validation mode. Each rule declares a dimension (VALIDITY, UNIQUENESS, and so on) and exactly one expectation. The tableConditionExpectation evaluates a SQL expression against the table as a whole, such as checking that the row count exceeds zero.
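Rules are plain objects, so extending the check is a matter of appending to the rules array. A hedged TypeScript sketch of one more rule, assuming the shakespeare table's word column and that your provider version accepts the COMPLETENESS dimension:

const completenessRule = {
    column: "word",
    dimension: "COMPLETENESS", // assumption: a supported dimension in your provider version
    threshold: 0.95,           // pass only if at least 95% of evaluated rows are non-null
    nonNullExpectation: {},
};
// Append alongside rule1 in dataQualitySpec.rules.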
Enforce column-level quality with multiple rule types
Comprehensive quality checks validate individual columns using expectations for nullability, ranges, patterns, and uniqueness.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const fullQuality = new gcp.dataplex.Datascan("full_quality", {
location: "us-central1",
displayName: "Full Datascan Quality",
dataScanId: "dataquality-full",
description: "Example resource - Full Datascan Quality",
labels: {
author: "billing",
},
data: {
resource: "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations",
},
executionSpec: {
trigger: {
schedule: {
cron: "TZ=America/New_York 1 1 * * *",
},
},
field: "modified_date",
},
dataQualitySpec: {
samplingPercent: 5,
rowFilter: "station_id > 1000",
catalogPublishingEnabled: true,
postScanActions: {
notificationReport: {
recipients: {
emails: ["jane.doe@example.com"],
},
scoreThresholdTrigger: {
scoreThreshold: 86,
},
},
},
rules: [
{
column: "address",
dimension: "VALIDITY",
threshold: 0.99,
nonNullExpectation: {},
},
{
column: "council_district",
dimension: "VALIDITY",
ignoreNull: true,
threshold: 0.9,
rangeExpectation: {
minValue: "1",
maxValue: "10",
strictMinEnabled: true,
strictMaxEnabled: false,
},
},
{
column: "power_type",
dimension: "VALIDITY",
ignoreNull: false,
regexExpectation: {
regex: ".*solar.*",
},
},
{
column: "property_type",
dimension: "VALIDITY",
ignoreNull: false,
setExpectation: {
values: [
"sidewalk",
"parkland",
],
},
},
{
column: "address",
dimension: "UNIQUENESS",
uniquenessExpectation: {},
},
{
column: "number_of_docks",
dimension: "VALIDITY",
statisticRangeExpectation: {
statistic: "MEAN",
minValue: "5",
maxValue: "15",
strictMinEnabled: true,
strictMaxEnabled: true,
},
},
{
column: "footprint_length",
dimension: "VALIDITY",
rowConditionExpectation: {
sqlExpression: "footprint_length > 0 AND footprint_length <= 10",
},
},
{
dimension: "VALIDITY",
tableConditionExpectation: {
sqlExpression: "COUNT(*) > 0",
},
},
{
dimension: "VALIDITY",
sqlAssertion: {
sqlStatement: "select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null",
},
},
],
},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
full_quality = gcp.dataplex.Datascan("full_quality",
location="us-central1",
display_name="Full Datascan Quality",
data_scan_id="dataquality-full",
description="Example resource - Full Datascan Quality",
labels={
"author": "billing",
},
data={
"resource": "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations",
},
execution_spec={
"trigger": {
"schedule": {
"cron": "TZ=America/New_York 1 1 * * *",
},
},
"field": "modified_date",
},
data_quality_spec={
"sampling_percent": 5,
"row_filter": "station_id > 1000",
"catalog_publishing_enabled": True,
"post_scan_actions": {
"notification_report": {
"recipients": {
"emails": ["jane.doe@example.com"],
},
"score_threshold_trigger": {
"score_threshold": 86,
},
},
},
"rules": [
{
"column": "address",
"dimension": "VALIDITY",
"threshold": 0.99,
"non_null_expectation": {},
},
{
"column": "council_district",
"dimension": "VALIDITY",
"ignore_null": True,
"threshold": 0.9,
"range_expectation": {
"min_value": "1",
"max_value": "10",
"strict_min_enabled": True,
"strict_max_enabled": False,
},
},
{
"column": "power_type",
"dimension": "VALIDITY",
"ignore_null": False,
"regex_expectation": {
"regex": ".*solar.*",
},
},
{
"column": "property_type",
"dimension": "VALIDITY",
"ignore_null": False,
"set_expectation": {
"values": [
"sidewalk",
"parkland",
],
},
},
{
"column": "address",
"dimension": "UNIQUENESS",
"uniqueness_expectation": {},
},
{
"column": "number_of_docks",
"dimension": "VALIDITY",
"statistic_range_expectation": {
"statistic": "MEAN",
"min_value": "5",
"max_value": "15",
"strict_min_enabled": True,
"strict_max_enabled": True,
},
},
{
"column": "footprint_length",
"dimension": "VALIDITY",
"row_condition_expectation": {
"sql_expression": "footprint_length > 0 AND footprint_length <= 10",
},
},
{
"dimension": "VALIDITY",
"table_condition_expectation": {
"sql_expression": "COUNT(*) > 0",
},
},
{
"dimension": "VALIDITY",
"sql_assertion": {
"sql_statement": "select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null",
},
},
],
},
project="my-project-name")
package main
import (
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
_, err := dataplex.NewDatascan(ctx, "full_quality", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DisplayName: pulumi.String("Full Datascan Quality"),
DataScanId: pulumi.String("dataquality-full"),
Description: pulumi.String("Example resource - Full Datascan Quality"),
Labels: pulumi.StringMap{
"author": pulumi.String("billing"),
},
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.String("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations"),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
Schedule: &dataplex.DatascanExecutionSpecTriggerScheduleArgs{
Cron: pulumi.String("TZ=America/New_York 1 1 * * *"),
},
},
Field: pulumi.String("modified_date"),
},
DataQualitySpec: &dataplex.DatascanDataQualitySpecArgs{
SamplingPercent: pulumi.Float64(5),
RowFilter: pulumi.String("station_id > 1000"),
CatalogPublishingEnabled: pulumi.Bool(true),
PostScanActions: &dataplex.DatascanDataQualitySpecPostScanActionsArgs{
NotificationReport: &dataplex.DatascanDataQualitySpecPostScanActionsNotificationReportArgs{
Recipients: &dataplex.DatascanDataQualitySpecPostScanActionsNotificationReportRecipientsArgs{
Emails: pulumi.StringArray{
pulumi.String("jane.doe@example.com"),
},
},
ScoreThresholdTrigger: &dataplex.DatascanDataQualitySpecPostScanActionsNotificationReportScoreThresholdTriggerArgs{
ScoreThreshold: pulumi.Float64(86),
},
},
},
Rules: dataplex.DatascanDataQualitySpecRuleArray{
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("address"),
Dimension: pulumi.String("VALIDITY"),
Threshold: pulumi.Float64(0.99),
NonNullExpectation: &dataplex.DatascanDataQualitySpecRuleNonNullExpectationArgs{},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("council_district"),
Dimension: pulumi.String("VALIDITY"),
IgnoreNull: pulumi.Bool(true),
Threshold: pulumi.Float64(0.9),
RangeExpectation: &dataplex.DatascanDataQualitySpecRuleRangeExpectationArgs{
MinValue: pulumi.String("1"),
MaxValue: pulumi.String("10"),
StrictMinEnabled: pulumi.Bool(true),
StrictMaxEnabled: pulumi.Bool(false),
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("power_type"),
Dimension: pulumi.String("VALIDITY"),
IgnoreNull: pulumi.Bool(false),
RegexExpectation: &dataplex.DatascanDataQualitySpecRuleRegexExpectationArgs{
Regex: pulumi.String(".*solar.*"),
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("property_type"),
Dimension: pulumi.String("VALIDITY"),
IgnoreNull: pulumi.Bool(false),
SetExpectation: &dataplex.DatascanDataQualitySpecRuleSetExpectationArgs{
Values: pulumi.StringArray{
pulumi.String("sidewalk"),
pulumi.String("parkland"),
},
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("address"),
Dimension: pulumi.String("UNIQUENESS"),
UniquenessExpectation: &dataplex.DatascanDataQualitySpecRuleUniquenessExpectationArgs{},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("number_of_docks"),
Dimension: pulumi.String("VALIDITY"),
StatisticRangeExpectation: &dataplex.DatascanDataQualitySpecRuleStatisticRangeExpectationArgs{
Statistic: pulumi.String("MEAN"),
MinValue: pulumi.String("5"),
MaxValue: pulumi.String("15"),
StrictMinEnabled: pulumi.Bool(true),
StrictMaxEnabled: pulumi.Bool(true),
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Column: pulumi.String("footprint_length"),
Dimension: pulumi.String("VALIDITY"),
RowConditionExpectation: &dataplex.DatascanDataQualitySpecRuleRowConditionExpectationArgs{
SqlExpression: pulumi.String("footprint_length > 0 AND footprint_length <= 10"),
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Dimension: pulumi.String("VALIDITY"),
TableConditionExpectation: &dataplex.DatascanDataQualitySpecRuleTableConditionExpectationArgs{
SqlExpression: pulumi.String("COUNT(*) > 0"),
},
},
&dataplex.DatascanDataQualitySpecRuleArgs{
Dimension: pulumi.String("VALIDITY"),
SqlAssertion: &dataplex.DatascanDataQualitySpecRuleSqlAssertionArgs{
SqlStatement: pulumi.String("select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null"),
},
},
},
},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var fullQuality = new Gcp.DataPlex.Datascan("full_quality", new()
{
Location = "us-central1",
DisplayName = "Full Datascan Quality",
DataScanId = "dataquality-full",
Description = "Example resource - Full Datascan Quality",
Labels =
{
{ "author", "billing" },
},
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = "//bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations",
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
Schedule = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerScheduleArgs
{
Cron = "TZ=America/New_York 1 1 * * *",
},
},
Field = "modified_date",
},
DataQualitySpec = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecArgs
{
SamplingPercent = 5,
RowFilter = "station_id > 1000",
CatalogPublishingEnabled = true,
PostScanActions = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecPostScanActionsArgs
{
NotificationReport = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecPostScanActionsNotificationReportArgs
{
Recipients = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecPostScanActionsNotificationReportRecipientsArgs
{
Emails = new[]
{
"jane.doe@example.com",
},
},
ScoreThresholdTrigger = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecPostScanActionsNotificationReportScoreThresholdTriggerArgs
{
ScoreThreshold = 86,
},
},
},
Rules = new[]
{
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "address",
Dimension = "VALIDITY",
Threshold = 0.99,
NonNullExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleNonNullExpectationArgs(),
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "council_district",
Dimension = "VALIDITY",
IgnoreNull = true,
Threshold = 0.9,
RangeExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleRangeExpectationArgs
{
MinValue = "1",
MaxValue = "10",
StrictMinEnabled = true,
StrictMaxEnabled = false,
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "power_type",
Dimension = "VALIDITY",
IgnoreNull = false,
RegexExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleRegexExpectationArgs
{
Regex = ".*solar.*",
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "property_type",
Dimension = "VALIDITY",
IgnoreNull = false,
SetExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleSetExpectationArgs
{
Values = new[]
{
"sidewalk",
"parkland",
},
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "address",
Dimension = "UNIQUENESS",
UniquenessExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleUniquenessExpectationArgs(),
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "number_of_docks",
Dimension = "VALIDITY",
StatisticRangeExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleStatisticRangeExpectationArgs
{
Statistic = "MEAN",
MinValue = "5",
MaxValue = "15",
StrictMinEnabled = true,
StrictMaxEnabled = true,
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Column = "footprint_length",
Dimension = "VALIDITY",
RowConditionExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleRowConditionExpectationArgs
{
SqlExpression = "footprint_length > 0 AND footprint_length <= 10",
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Dimension = "VALIDITY",
TableConditionExpectation = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleTableConditionExpectationArgs
{
SqlExpression = "COUNT(*) > 0",
},
},
new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleArgs
{
Dimension = "VALIDITY",
SqlAssertion = new Gcp.DataPlex.Inputs.DatascanDataQualitySpecRuleSqlAssertionArgs
{
SqlStatement = "select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null",
},
},
},
},
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerScheduleArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecPostScanActionsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecPostScanActionsNotificationReportArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecPostScanActionsNotificationReportRecipientsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecPostScanActionsNotificationReportScoreThresholdTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleNonNullExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleRangeExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleRegexExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleSetExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleUniquenessExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleStatisticRangeExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleRowConditionExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleTableConditionExpectationArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataQualitySpecRuleSqlAssertionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var fullQuality = new Datascan("fullQuality", DatascanArgs.builder()
.location("us-central1")
.displayName("Full Datascan Quality")
.dataScanId("dataquality-full")
.description("Example resource - Full Datascan Quality")
.labels(Map.of("author", "billing"))
.data(DatascanDataArgs.builder()
.resource("//bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations")
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.schedule(DatascanExecutionSpecTriggerScheduleArgs.builder()
.cron("TZ=America/New_York 1 1 * * *")
.build())
.build())
.field("modified_date")
.build())
.dataQualitySpec(DatascanDataQualitySpecArgs.builder()
.samplingPercent(5.0)
.rowFilter("station_id > 1000")
.catalogPublishingEnabled(true)
.postScanActions(DatascanDataQualitySpecPostScanActionsArgs.builder()
.notificationReport(DatascanDataQualitySpecPostScanActionsNotificationReportArgs.builder()
.recipients(DatascanDataQualitySpecPostScanActionsNotificationReportRecipientsArgs.builder()
.emails("jane.doe@example.com")
.build())
.scoreThresholdTrigger(DatascanDataQualitySpecPostScanActionsNotificationReportScoreThresholdTriggerArgs.builder()
.scoreThreshold(86.0)
.build())
.build())
.build())
.rules(
DatascanDataQualitySpecRuleArgs.builder()
.column("address")
.dimension("VALIDITY")
.threshold(0.99)
.nonNullExpectation(DatascanDataQualitySpecRuleNonNullExpectationArgs.builder()
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("council_district")
.dimension("VALIDITY")
.ignoreNull(true)
.threshold(0.9)
.rangeExpectation(DatascanDataQualitySpecRuleRangeExpectationArgs.builder()
.minValue("1")
.maxValue("10")
.strictMinEnabled(true)
.strictMaxEnabled(false)
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("power_type")
.dimension("VALIDITY")
.ignoreNull(false)
.regexExpectation(DatascanDataQualitySpecRuleRegexExpectationArgs.builder()
.regex(".*solar.*")
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("property_type")
.dimension("VALIDITY")
.ignoreNull(false)
.setExpectation(DatascanDataQualitySpecRuleSetExpectationArgs.builder()
.values(
"sidewalk",
"parkland")
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("address")
.dimension("UNIQUENESS")
.uniquenessExpectation(DatascanDataQualitySpecRuleUniquenessExpectationArgs.builder()
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("number_of_docks")
.dimension("VALIDITY")
.statisticRangeExpectation(DatascanDataQualitySpecRuleStatisticRangeExpectationArgs.builder()
.statistic("MEAN")
.minValue("5")
.maxValue("15")
.strictMinEnabled(true)
.strictMaxEnabled(true)
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.column("footprint_length")
.dimension("VALIDITY")
.rowConditionExpectation(DatascanDataQualitySpecRuleRowConditionExpectationArgs.builder()
.sqlExpression("footprint_length > 0 AND footprint_length <= 10")
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.dimension("VALIDITY")
.tableConditionExpectation(DatascanDataQualitySpecRuleTableConditionExpectationArgs.builder()
.sqlExpression("COUNT(*) > 0")
.build())
.build(),
DatascanDataQualitySpecRuleArgs.builder()
.dimension("VALIDITY")
.sqlAssertion(DatascanDataQualitySpecRuleSqlAssertionArgs.builder()
.sqlStatement("select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null")
.build())
.build())
.build())
.project("my-project-name")
.build());
}
}
resources:
fullQuality:
type: gcp:dataplex:Datascan
name: full_quality
properties:
location: us-central1
displayName: Full Datascan Quality
dataScanId: dataquality-full
description: Example resource - Full Datascan Quality
labels:
author: billing
data:
resource: //bigquery.googleapis.com/projects/bigquery-public-data/datasets/austin_bikeshare/tables/bikeshare_stations
executionSpec:
trigger:
schedule:
cron: TZ=America/New_York 1 1 * * *
field: modified_date
dataQualitySpec:
samplingPercent: 5
rowFilter: station_id > 1000
catalogPublishingEnabled: true
postScanActions:
notificationReport:
recipients:
emails:
- jane.doe@example.com
scoreThresholdTrigger:
scoreThreshold: 86
rules:
- column: address
dimension: VALIDITY
threshold: 0.99
nonNullExpectation: {}
- column: council_district
dimension: VALIDITY
ignoreNull: true
threshold: 0.9
rangeExpectation:
minValue: 1
maxValue: 10
strictMinEnabled: true
strictMaxEnabled: false
- column: power_type
dimension: VALIDITY
ignoreNull: false
regexExpectation:
regex: .*solar.*
- column: property_type
dimension: VALIDITY
ignoreNull: false
setExpectation:
values:
- sidewalk
- parkland
- column: address
dimension: UNIQUENESS
uniquenessExpectation: {}
- column: number_of_docks
dimension: VALIDITY
statisticRangeExpectation:
statistic: MEAN
minValue: 5
maxValue: 15
strictMinEnabled: true
strictMaxEnabled: true
- column: footprint_length
dimension: VALIDITY
rowConditionExpectation:
sqlExpression: footprint_length > 0 AND footprint_length <= 10
- dimension: VALIDITY
tableConditionExpectation:
sqlExpression: COUNT(*) > 0
- dimension: VALIDITY
sqlAssertion:
sqlStatement: select * from bigquery-public-data.austin_bikeshare.bikeshare_stations where station_id is null
project: my-project-name
Column-level rules name the column they validate. The nonNullExpectation checks for missing values. The rangeExpectation validates numeric bounds with optional strict comparisons. The regexExpectation matches string patterns. The setExpectation checks membership in an allowed set of values. The uniquenessExpectation detects duplicates. The statisticRangeExpectation bounds an aggregate statistic such as the mean, the rowConditionExpectation evaluates a SQL expression per row, and the sqlAssertion fails when its query returns any rows. The executionSpec.field property identifies a monotonically increasing date or timestamp column used for incremental scans. The postScanActions.notificationReport sends email alerts when the quality score drops below the configured threshold.
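When many columns need the same check, generating rules programmatically keeps the spec readable. A minimal TypeScript sketch, assuming a hypothetical list of required columns from the bikeshare_stations table:

// Sketch: one VALIDITY non-null rule per required column.
const requiredColumns = ["address", "power_type", "property_type"]; // hypothetical selection
const nonNullRules = requiredColumns.map(column => ({
    column,
    dimension: "VALIDITY",
    threshold: 0.99,
    nonNullExpectation: {},
}));
// Spread into the spec: rules: [...nonNullRules, /* other rules */],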
Discover Cloud Storage schemas with on-demand scans
Cloud Storage buckets often contain files without explicit schemas. Discovery scans infer schemas by sampling files.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const tfTestBucket = new gcp.storage.Bucket("tf_test_bucket", {
name: "tf-test-bucket-name-_72490",
location: "us-west1",
uniformBucketLevelAccess: true,
});
const basicDiscovery = new gcp.dataplex.Datascan("basic_discovery", {
location: "us-central1",
dataScanId: "datadiscovery-basic",
data: {
resource: pulumi.interpolate`//storage.googleapis.com/projects/${tfTestBucket.project}/buckets/${tfTestBucket.name}`,
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataDiscoverySpec: {},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
tf_test_bucket = gcp.storage.Bucket("tf_test_bucket",
name="tf-test-bucket-name-_72490",
location="us-west1",
uniform_bucket_level_access=True)
basic_discovery = gcp.dataplex.Datascan("basic_discovery",
location="us-central1",
data_scan_id="datadiscovery-basic",
data={
"resource": pulumi.Output.all(
project=tf_test_bucket.project,
name=tf_test_bucket.name
).apply(lambda resolved_outputs: f"//storage.googleapis.com/projects/{resolved_outputs['project']}/buckets/{resolved_outputs['name']}")
,
},
execution_spec={
"trigger": {
"on_demand": {},
},
},
data_discovery_spec={},
project="my-project-name")
package main
import (
"fmt"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/storage"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tfTestBucket, err := storage.NewBucket(ctx, "tf_test_bucket", &storage.BucketArgs{
Name: pulumi.String("tf-test-bucket-name-_72490"),
Location: pulumi.String("us-west1"),
UniformBucketLevelAccess: pulumi.Bool(true),
})
if err != nil {
return err
}
_, err = dataplex.NewDatascan(ctx, "basic_discovery", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DataScanId: pulumi.String("datadiscovery-basic"),
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.All(tfTestBucket.Project, tfTestBucket.Name).ApplyT(func(_args []interface{}) (string, error) {
project := _args[0].(string)
name := _args[1].(string)
return fmt.Sprintf("//storage.googleapis.com/projects/%v/buckets/%v", project, name), nil
}).(pulumi.StringOutput),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
OnDemand: &dataplex.DatascanExecutionSpecTriggerOnDemandArgs{},
},
},
DataDiscoverySpec: &dataplex.DatascanDataDiscoverySpecArgs{},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var tfTestBucket = new Gcp.Storage.Bucket("tf_test_bucket", new()
{
Name = "tf-test-bucket-name-_72490",
Location = "us-west1",
UniformBucketLevelAccess = true,
});
var basicDiscovery = new Gcp.DataPlex.Datascan("basic_discovery", new()
{
Location = "us-central1",
DataScanId = "datadiscovery-basic",
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = Output.Tuple(tfTestBucket.Project, tfTestBucket.Name).Apply(values =>
{
var project = values.Item1;
var name = values.Item2;
return $"//storage.googleapis.com/projects/{project}/buckets/{name}";
}),
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
OnDemand = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerOnDemandArgs(),
},
},
DataDiscoverySpec = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecArgs(),
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.storage.Bucket;
import com.pulumi.gcp.storage.BucketArgs;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerOnDemandArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var tfTestBucket = new Bucket("tfTestBucket", BucketArgs.builder()
.name("tf-test-bucket-name-_72490")
.location("us-west1")
.uniformBucketLevelAccess(true)
.build());
var basicDiscovery = new Datascan("basicDiscovery", DatascanArgs.builder()
.location("us-central1")
.dataScanId("datadiscovery-basic")
.data(DatascanDataArgs.builder()
.resource(Output.tuple(tfTestBucket.project(), tfTestBucket.name()).applyValue(values -> {
var project = values.t1;
var name = values.t2;
return String.format("//storage.googleapis.com/projects/%s/buckets/%s", project,name);
}))
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.onDemand(DatascanExecutionSpecTriggerOnDemandArgs.builder()
.build())
.build())
.build())
.dataDiscoverySpec(DatascanDataDiscoverySpecArgs.builder()
.build())
.project("my-project-name")
.build());
}
}
resources:
basicDiscovery:
type: gcp:dataplex:Datascan
name: basic_discovery
properties:
location: us-central1
dataScanId: datadiscovery-basic
data:
resource: //storage.googleapis.com/projects/${tfTestBucket.project}/buckets/${tfTestBucket.name}
executionSpec:
trigger:
onDemand: {}
dataDiscoverySpec: {}
project: my-project-name
tfTestBucket:
type: gcp:storage:Bucket
name: tf_test_bucket
properties:
name: tf-test-bucket-name-_72490
location: us-west1
uniformBucketLevelAccess: true
An empty dataDiscoverySpec enables discovery mode with default settings. The data.resource property points to a Cloud Storage bucket using the resource URL format. Discovery scans sample the bucket's files to detect formats (CSV, JSON, Parquet) and infer column names and types.
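If the bucket is managed outside the stack, the same resource URL can be assembled from configuration rather than a Bucket resource. A TypeScript sketch, assuming a hypothetical bucketName config value and that the provider's default project is set:

const config = new pulumi.Config();
const bucketName = config.require("bucketName"); // hypothetical config key
const bucketUrl = pulumi.interpolate`//storage.googleapis.com/projects/${gcp.config.project}/buckets/${bucketName}`;
// Use bucketUrl as data.resource in the Datascan.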
Publish discovered schemas to BigLake tables
After discovering schemas in Cloud Storage, teams often want to query the data through BigQuery using BigLake tables.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const tfTestBucket = new gcp.storage.Bucket("tf_test_bucket", {
name: "tf-test-bucket-name-_89605",
location: "us-west1",
uniformBucketLevelAccess: true,
});
const tfTestConnection = new gcp.bigquery.Connection("tf_test_connection", {
connectionId: "tf-test-connection-_56730",
location: "us-central1",
friendlyName: "tf-test-connection-_95154",
description: "a bigquery connection for tf test",
cloudResource: {},
});
const fullDiscovery = new gcp.dataplex.Datascan("full_discovery", {
location: "us-central1",
displayName: "Full Datascan Discovery",
dataScanId: "datadiscovery-full",
description: "Example resource - Full Datascan Discovery",
labels: {
author: "billing",
},
data: {
resource: pulumi.interpolate`//storage.googleapis.com/projects/${tfTestBucket.project}/buckets/${tfTestBucket.name}`,
},
executionSpec: {
trigger: {
schedule: {
cron: "TZ=America/New_York 1 1 * * *",
},
},
},
dataDiscoverySpec: {
bigqueryPublishingConfig: {
tableType: "BIGLAKE",
connection: pulumi.all([tfTestConnection.project, tfTestConnection.location, tfTestConnection.connectionId]).apply(([project, location, connectionId]) => `projects/${project}/locations/${location}/connections/${connectionId}`),
location: tfTestBucket.location,
project: pulumi.interpolate`projects/${tfTestBucket.project}`,
},
storageConfig: {
includePatterns: [
"ai*",
"ml*",
],
excludePatterns: [
"doc*",
"gen*",
],
csvOptions: {
headerRows: 5,
delimiter: ",",
encoding: "UTF-8",
typeInferenceDisabled: false,
quote: "'",
},
jsonOptions: {
encoding: "UTF-8",
typeInferenceDisabled: false,
},
},
},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
tf_test_bucket = gcp.storage.Bucket("tf_test_bucket",
name="tf-test-bucket-name-_89605",
location="us-west1",
uniform_bucket_level_access=True)
tf_test_connection = gcp.bigquery.Connection("tf_test_connection",
connection_id="tf-test-connection-_56730",
location="us-central1",
friendly_name="tf-test-connection-_95154",
description="a bigquery connection for tf test",
cloud_resource={})
full_discovery = gcp.dataplex.Datascan("full_discovery",
location="us-central1",
display_name="Full Datascan Discovery",
data_scan_id="datadiscovery-full",
description="Example resource - Full Datascan Discovery",
labels={
"author": "billing",
},
data={
"resource": pulumi.Output.all(
project=tf_test_bucket.project,
name=tf_test_bucket.name
).apply(lambda resolved_outputs: f"//storage.googleapis.com/projects/{resolved_outputs['project']}/buckets/{resolved_outputs['name']}")
,
},
execution_spec={
"trigger": {
"schedule": {
"cron": "TZ=America/New_York 1 1 * * *",
},
},
},
data_discovery_spec={
"bigquery_publishing_config": {
"table_type": "BIGLAKE",
"connection": pulumi.Output.all(
project=tf_test_connection.project,
location=tf_test_connection.location,
connection_id=tf_test_connection.connection_id
).apply(lambda resolved_outputs: f"projects/{resolved_outputs['project']}/locations/{resolved_outputs['location']}/connections/{resolved_outputs['connection_id']}")
,
"location": tf_test_bucket.location,
"project": tf_test_bucket.project.apply(lambda project: f"projects/{project}"),
},
"storage_config": {
"include_patterns": [
"ai*",
"ml*",
],
"exclude_patterns": [
"doc*",
"gen*",
],
"csv_options": {
"header_rows": 5,
"delimiter": ",",
"encoding": "UTF-8",
"type_inference_disabled": False,
"quote": "'",
},
"json_options": {
"encoding": "UTF-8",
"type_inference_disabled": False,
},
},
},
project="my-project-name")
package main
import (
"fmt"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/bigquery"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/storage"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tfTestBucket, err := storage.NewBucket(ctx, "tf_test_bucket", &storage.BucketArgs{
Name: pulumi.String("tf-test-bucket-name-_89605"),
Location: pulumi.String("us-west1"),
UniformBucketLevelAccess: pulumi.Bool(true),
})
if err != nil {
return err
}
tfTestConnection, err := bigquery.NewConnection(ctx, "tf_test_connection", &bigquery.ConnectionArgs{
ConnectionId: pulumi.String("tf-test-connection-_56730"),
Location: pulumi.String("us-central1"),
FriendlyName: pulumi.String("tf-test-connection-_95154"),
Description: pulumi.String("a bigquery connection for tf test"),
CloudResource: &bigquery.ConnectionCloudResourceArgs{},
})
if err != nil {
return err
}
_, err = dataplex.NewDatascan(ctx, "full_discovery", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DisplayName: pulumi.String("Full Datascan Discovery"),
DataScanId: pulumi.String("datadiscovery-full"),
Description: pulumi.String("Example resource - Full Datascan Discovery"),
Labels: pulumi.StringMap{
"author": pulumi.String("billing"),
},
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.All(tfTestBucket.Project, tfTestBucket.Name).ApplyT(func(_args []interface{}) (string, error) {
project := _args[0].(string)
name := _args[1].(string)
return fmt.Sprintf("//storage.googleapis.com/projects/%v/buckets/%v", project, name), nil
}).(pulumi.StringOutput),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
Schedule: &dataplex.DatascanExecutionSpecTriggerScheduleArgs{
Cron: pulumi.String("TZ=America/New_York 1 1 * * *"),
},
},
},
DataDiscoverySpec: &dataplex.DatascanDataDiscoverySpecArgs{
BigqueryPublishingConfig: &dataplex.DatascanDataDiscoverySpecBigqueryPublishingConfigArgs{
TableType: pulumi.String("BIGLAKE"),
Connection: pulumi.All(tfTestConnection.Project, tfTestConnection.Location, tfTestConnection.ConnectionId).ApplyT(func(_args []interface{}) (string, error) {
project := _args[0].(string)
location := _args[1].(*string)
connectionId := _args[2].(string)
// Dereference the optional location so its value, not its address, is formatted.
return fmt.Sprintf("projects/%v/locations/%v/connections/%v", project, *location, connectionId), nil
}).(pulumi.StringOutput),
Location: tfTestBucket.Location,
Project: tfTestBucket.Project.ApplyT(func(project string) (string, error) {
return fmt.Sprintf("projects/%v", project), nil
}).(pulumi.StringOutput),
},
StorageConfig: &dataplex.DatascanDataDiscoverySpecStorageConfigArgs{
IncludePatterns: pulumi.StringArray{
pulumi.String("ai*"),
pulumi.String("ml*"),
},
ExcludePatterns: pulumi.StringArray{
pulumi.String("doc*"),
pulumi.String("gen*"),
},
CsvOptions: &dataplex.DatascanDataDiscoverySpecStorageConfigCsvOptionsArgs{
HeaderRows: pulumi.Int(5),
Delimiter: pulumi.String(","),
Encoding: pulumi.String("UTF-8"),
TypeInferenceDisabled: pulumi.Bool(false),
Quote: pulumi.String("'"),
},
JsonOptions: &dataplex.DatascanDataDiscoverySpecStorageConfigJsonOptionsArgs{
Encoding: pulumi.String("UTF-8"),
TypeInferenceDisabled: pulumi.Bool(false),
},
},
},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var tfTestBucket = new Gcp.Storage.Bucket("tf_test_bucket", new()
{
Name = "tf-test-bucket-name-_89605",
Location = "us-west1",
UniformBucketLevelAccess = true,
});
var tfTestConnection = new Gcp.BigQuery.Connection("tf_test_connection", new()
{
ConnectionId = "tf-test-connection-_56730",
Location = "us-central1",
FriendlyName = "tf-test-connection-_95154",
Description = "a bigquery connection for tf test",
CloudResource = new Gcp.BigQuery.Inputs.ConnectionCloudResourceArgs(),
});
var fullDiscovery = new Gcp.DataPlex.Datascan("full_discovery", new()
{
Location = "us-central1",
DisplayName = "Full Datascan Discovery",
DataScanId = "datadiscovery-full",
Description = "Example resource - Full Datascan Discovery",
Labels =
{
{ "author", "billing" },
},
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = Output.Tuple(tfTestBucket.Project, tfTestBucket.Name).Apply(values =>
{
var project = values.Item1;
var name = values.Item2;
return $"//storage.googleapis.com/projects/{project}/buckets/{name}";
}),
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
Schedule = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerScheduleArgs
{
Cron = "TZ=America/New_York 1 1 * * *",
},
},
},
DataDiscoverySpec = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecArgs
{
BigqueryPublishingConfig = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecBigqueryPublishingConfigArgs
{
TableType = "BIGLAKE",
Connection = Output.Tuple(tfTestConnection.Project, tfTestConnection.Location, tfTestConnection.ConnectionId).Apply(values =>
{
var project = values.Item1;
var location = values.Item2;
var connectionId = values.Item3;
return $"projects/{project}/locations/{location}/connections/{connectionId}";
}),
Location = tfTestBucket.Location,
Project = tfTestBucket.Project.Apply(project => $"projects/{project}"),
},
StorageConfig = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecStorageConfigArgs
{
IncludePatterns = new[]
{
"ai*",
"ml*",
},
ExcludePatterns = new[]
{
"doc*",
"gen*",
},
CsvOptions = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecStorageConfigCsvOptionsArgs
{
HeaderRows = 5,
Delimiter = ",",
Encoding = "UTF-8",
TypeInferenceDisabled = false,
Quote = "'",
},
JsonOptions = new Gcp.DataPlex.Inputs.DatascanDataDiscoverySpecStorageConfigJsonOptionsArgs
{
Encoding = "UTF-8",
TypeInferenceDisabled = false,
},
},
},
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.storage.Bucket;
import com.pulumi.gcp.storage.BucketArgs;
import com.pulumi.gcp.bigquery.Connection;
import com.pulumi.gcp.bigquery.ConnectionArgs;
import com.pulumi.gcp.bigquery.inputs.ConnectionCloudResourceArgs;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerScheduleArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecBigqueryPublishingConfigArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecStorageConfigArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecStorageConfigCsvOptionsArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDiscoverySpecStorageConfigJsonOptionsArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var tfTestBucket = new Bucket("tfTestBucket", BucketArgs.builder()
.name("tf-test-bucket-name-_89605")
.location("us-west1")
.uniformBucketLevelAccess(true)
.build());
var tfTestConnection = new Connection("tfTestConnection", ConnectionArgs.builder()
.connectionId("tf-test-connection-_56730")
.location("us-central1")
.friendlyName("tf-test-connection-_95154")
.description("a bigquery connection for tf test")
.cloudResource(ConnectionCloudResourceArgs.builder()
.build())
.build());
var fullDiscovery = new Datascan("fullDiscovery", DatascanArgs.builder()
.location("us-central1")
.displayName("Full Datascan Discovery")
.dataScanId("datadiscovery-full")
.description("Example resource - Full Datascan Discovery")
.labels(Map.of("author", "billing"))
.data(DatascanDataArgs.builder()
.resource(Output.tuple(tfTestBucket.project(), tfTestBucket.name()).applyValue(values -> {
var project = values.t1;
var name = values.t2;
return String.format("//storage.googleapis.com/projects/%s/buckets/%s", project,name);
}))
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.schedule(DatascanExecutionSpecTriggerScheduleArgs.builder()
.cron("TZ=America/New_York 1 1 * * *")
.build())
.build())
.build())
.dataDiscoverySpec(DatascanDataDiscoverySpecArgs.builder()
.bigqueryPublishingConfig(DatascanDataDiscoverySpecBigqueryPublishingConfigArgs.builder()
.tableType("BIGLAKE")
.connection(Output.tuple(tfTestConnection.project(), tfTestConnection.location(), tfTestConnection.connectionId()).applyValue(values -> {
var project = values.t1;
var location = values.t2;
var connectionId = values.t3;
return String.format("projects/%s/locations/%s/connections/%s", project,location,connectionId);
}))
.location(tfTestBucket.location())
.project(tfTestBucket.project().applyValue(_project -> String.format("projects/%s", _project)))
.build())
.storageConfig(DatascanDataDiscoverySpecStorageConfigArgs.builder()
.includePatterns(
"ai*",
"ml*")
.excludePatterns(
"doc*",
"gen*")
.csvOptions(DatascanDataDiscoverySpecStorageConfigCsvOptionsArgs.builder()
.headerRows(5)
.delimiter(",")
.encoding("UTF-8")
.typeInferenceDisabled(false)
.quote("'")
.build())
.jsonOptions(DatascanDataDiscoverySpecStorageConfigJsonOptionsArgs.builder()
.encoding("UTF-8")
.typeInferenceDisabled(false)
.build())
.build())
.build())
.project("my-project-name")
.build());
}
}
resources:
fullDiscovery:
type: gcp:dataplex:Datascan
name: full_discovery
properties:
location: us-central1
displayName: Full Datascan Discovery
dataScanId: datadiscovery-full
description: Example resource - Full Datascan Discovery
labels:
author: billing
data:
resource: //storage.googleapis.com/projects/${tfTestBucket.project}/buckets/${tfTestBucket.name}
executionSpec:
trigger:
schedule:
cron: TZ=America/New_York 1 1 * * *
dataDiscoverySpec:
bigqueryPublishingConfig:
tableType: BIGLAKE
connection: projects/${tfTestConnection.project}/locations/${tfTestConnection.location}/connections/${tfTestConnection.connectionId}
location: ${tfTestBucket.location}
project: projects/${tfTestBucket.project}
storageConfig:
includePatterns:
- ai*
- ml*
excludePatterns:
- doc*
- gen*
csvOptions:
headerRows: 5
delimiter: ','
encoding: UTF-8
typeInferenceDisabled: false
quote: ''''
jsonOptions:
encoding: UTF-8
typeInferenceDisabled: false
project: my-project-name
tfTestBucket:
type: gcp:storage:Bucket
name: tf_test_bucket
properties:
name: tf-test-bucket-name-_89605
location: us-west1
uniformBucketLevelAccess: true
tfTestConnection:
type: gcp:bigquery:Connection
name: tf_test_connection
properties:
connectionId: tf-test-connection-_56730
location: us-central1
friendlyName: tf-test-connection-_95154
description: a bigquery connection for tf test
cloudResource: {}
The bigqueryPublishingConfig block publishes discovered schemas as BigLake tables: tableType selects BIGLAKE, and connection references the BigQuery connection used for external data access. The storageConfig block filters scanned objects with includePatterns and excludePatterns, while csvOptions and jsonOptions control format-specific parsing behavior such as header rows, delimiters, and encoding.
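BigLake publishing is optional; when you only need discovery with pattern filtering, the connection and publishing config can be left out entirely. A minimal sketch, assuming a pre-existing bucket named my-discovery-bucket in my-project-name (both placeholders):
import * as gcp from "@pulumi/gcp";
// Minimal discovery scan: pattern filtering only, no BigQuery connection.
// The bucket and project names below are placeholders for illustration.
const minimalDiscovery = new gcp.dataplex.Datascan("minimal_discovery", {
location: "us-central1",
dataScanId: "datadiscovery-minimal",
data: {
resource: "//storage.googleapis.com/projects/my-project-name/buckets/my-discovery-bucket",
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataDiscoverySpec: {
storageConfig: {
includePatterns: ["raw/*"],
excludePatterns: ["raw/tmp*"],
},
},
project: "my-project-name",
});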
Generate documentation for BigQuery tables
Data catalogs benefit from automated documentation that captures table schemas and column descriptions.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";
const tfDataplexTestDataset = new gcp.bigquery.Dataset("tf_dataplex_test_dataset", {
datasetId: "tf_dataplex_test_dataset_id__64336",
defaultTableExpirationMs: 3600000,
});
const tfDataplexTestTable = new gcp.bigquery.Table("tf_dataplex_test_table", {
datasetId: tfDataplexTestDataset.datasetId,
tableId: "tf_dataplex_test_table_id__34962",
deletionProtection: false,
schema: ` [
{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "station_id",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The id of the bike station"
},
{
"name": "address",
"type": "STRING",
"mode": "NULLABLE",
"description": "The address of the bike station"
},
{
"name": "power_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The power type of the bike station"
},
{
"name": "property_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The type of the property"
},
{
"name": "number_of_docks",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The number of docks the property has"
},
{
"name": "footprint_length",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The footprint length of the property"
},
{
"name": "council_district",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The council district the property is in"
}
]
`,
});
const documentation = new gcp.dataplex.Datascan("documentation", {
location: "us-central1",
dataScanId: "datadocumentation",
data: {
resource: pulumi.interpolate`//bigquery.googleapis.com/projects/my-project-name/datasets/${tfDataplexTestDataset.datasetId}/tables/${tfDataplexTestTable.tableId}`,
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataDocumentationSpec: {},
project: "my-project-name",
});
import pulumi
import pulumi_gcp as gcp
tf_dataplex_test_dataset = gcp.bigquery.Dataset("tf_dataplex_test_dataset",
dataset_id="tf_dataplex_test_dataset_id__64336",
default_table_expiration_ms=3600000)
tf_dataplex_test_table = gcp.bigquery.Table("tf_dataplex_test_table",
dataset_id=tf_dataplex_test_dataset.dataset_id,
table_id="tf_dataplex_test_table_id__34962",
deletion_protection=False,
schema=""" [
{
\"name\": \"name\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\"
},
{
\"name\": \"station_id\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The id of the bike station\"
},
{
\"name\": \"address\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The address of the bike station\"
},
{
\"name\": \"power_type\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The powert type of the bike station\"
},
{
\"name\": \"property_type\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The type of the property\"
},
{
\"name\": \"number_of_docks\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The number of docks the property have\"
},
{
\"name\": \"footprint_length\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The footpring lenght of the property\"
},
{
\"name\": \"council_district\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The council district the property is in\"
}
]
""")
documentation = gcp.dataplex.Datascan("documentation",
location="us-central1",
data_scan_id="datadocumentation",
data={
"resource": pulumi.Output.all(
dataset_id=tf_dataplex_test_dataset.dataset_id,
table_id=tf_dataplex_test_table.table_id
).apply(lambda resolved_outputs: f"//bigquery.googleapis.com/projects/my-project-name/datasets/{resolved_outputs['dataset_id']}/tables/{resolved_outputs['table_id']}"),
},
execution_spec={
"trigger": {
"on_demand": {},
},
},
data_documentation_spec={},
project="my-project-name")
package main
import (
"fmt"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/bigquery"
"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataplex"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
func main() {
pulumi.Run(func(ctx *pulumi.Context) error {
tfDataplexTestDataset, err := bigquery.NewDataset(ctx, "tf_dataplex_test_dataset", &bigquery.DatasetArgs{
DatasetId: pulumi.String("tf_dataplex_test_dataset_id__64336"),
DefaultTableExpirationMs: pulumi.Int(3600000),
})
if err != nil {
return err
}
tfDataplexTestTable, err := bigquery.NewTable(ctx, "tf_dataplex_test_table", &bigquery.TableArgs{
DatasetId: tfDataplexTestDataset.DatasetId,
TableId: pulumi.String("tf_dataplex_test_table_id__34962"),
DeletionProtection: pulumi.Bool(false),
Schema: pulumi.String(` [
{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "station_id",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The id of the bike station"
},
{
"name": "address",
"type": "STRING",
"mode": "NULLABLE",
"description": "The address of the bike station"
},
{
"name": "power_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The power type of the bike station"
},
{
"name": "property_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The type of the property"
},
{
"name": "number_of_docks",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The number of docks the property has"
},
{
"name": "footprint_length",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The footprint length of the property"
},
{
"name": "council_district",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The council district the property is in"
}
]
`),
})
if err != nil {
return err
}
_, err = dataplex.NewDatascan(ctx, "documentation", &dataplex.DatascanArgs{
Location: pulumi.String("us-central1"),
DataScanId: pulumi.String("datadocumentation"),
Data: &dataplex.DatascanDataArgs{
Resource: pulumi.All(tfDataplexTestDataset.DatasetId, tfDataplexTestTable.TableId).ApplyT(func(_args []interface{}) (string, error) {
datasetId := _args[0].(string)
tableId := _args[1].(string)
return fmt.Sprintf("//bigquery.googleapis.com/projects/my-project-name/datasets/%v/tables/%v", datasetId, tableId), nil
}).(pulumi.StringOutput),
},
ExecutionSpec: &dataplex.DatascanExecutionSpecArgs{
Trigger: &dataplex.DatascanExecutionSpecTriggerArgs{
OnDemand: &dataplex.DatascanExecutionSpecTriggerOnDemandArgs{},
},
},
DataDocumentationSpec: &dataplex.DatascanDataDocumentationSpecArgs{},
Project: pulumi.String("my-project-name"),
})
if err != nil {
return err
}
return nil
})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;
return await Deployment.RunAsync(() =>
{
var tfDataplexTestDataset = new Gcp.BigQuery.Dataset("tf_dataplex_test_dataset", new()
{
DatasetId = "tf_dataplex_test_dataset_id__64336",
DefaultTableExpirationMs = 3600000,
});
var tfDataplexTestTable = new Gcp.BigQuery.Table("tf_dataplex_test_table", new()
{
DatasetId = tfDataplexTestDataset.DatasetId,
TableId = "tf_dataplex_test_table_id__34962",
DeletionProtection = false,
Schema = @" [
{
\""name\"": \""name\"",
\""type\"": \""STRING\"",
\""mode\"": \""NULLABLE\""
},
{
\""name\"": \""station_id\"",
\""type\"": \""INTEGER\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The id of the bike station\""
},
{
\""name\"": \""address\"",
\""type\"": \""STRING\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The address of the bike station\""
},
{
\""name\"": \""power_type\"",
\""type\"": \""STRING\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The powert type of the bike station\""
},
{
\""name\"": \""property_type\"",
\""type\"": \""STRING\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The type of the property\""
},
{
\""name\"": \""number_of_docks\"",
\""type\"": \""INTEGER\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The number of docks the property have\""
},
{
\""name\"": \""footprint_length\"",
\""type\"": \""INTEGER\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The footpring lenght of the property\""
},
{
\""name\"": \""council_district\"",
\""type\"": \""INTEGER\"",
\""mode\"": \""NULLABLE\"",
\""description\"": \""The council district the property is in\""
}
]
",
});
var documentation = new Gcp.DataPlex.Datascan("documentation", new()
{
Location = "us-central1",
DataScanId = "datadocumentation",
Data = new Gcp.DataPlex.Inputs.DatascanDataArgs
{
Resource = Output.Tuple(tfDataplexTestDataset.DatasetId, tfDataplexTestTable.TableId).Apply(values =>
{
var datasetId = values.Item1;
var tableId = values.Item2;
return $"//bigquery.googleapis.com/projects/my-project-name/datasets/{datasetId}/tables/{tableId}";
}),
},
ExecutionSpec = new Gcp.DataPlex.Inputs.DatascanExecutionSpecArgs
{
Trigger = new Gcp.DataPlex.Inputs.DatascanExecutionSpecTriggerArgs
{
OnDemand = null,
},
},
DataDocumentationSpec = null,
Project = "my-project-name",
});
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.bigquery.Dataset;
import com.pulumi.gcp.bigquery.DatasetArgs;
import com.pulumi.gcp.bigquery.Table;
import com.pulumi.gcp.bigquery.TableArgs;
import com.pulumi.gcp.dataplex.Datascan;
import com.pulumi.gcp.dataplex.DatascanArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanExecutionSpecTriggerOnDemandArgs;
import com.pulumi.gcp.dataplex.inputs.DatascanDataDocumentationSpecArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var tfDataplexTestDataset = new Dataset("tfDataplexTestDataset", DatasetArgs.builder()
.datasetId("tf_dataplex_test_dataset_id__64336")
.defaultTableExpirationMs(3600000)
.build());
var tfDataplexTestTable = new Table("tfDataplexTestTable", TableArgs.builder()
.datasetId(tfDataplexTestDataset.datasetId())
.tableId("tf_dataplex_test_table_id__34962")
.deletionProtection(false)
.schema("""
[
{
\"name\": \"name\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\"
},
{
\"name\": \"station_id\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The id of the bike station\"
},
{
\"name\": \"address\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The address of the bike station\"
},
{
\"name\": \"power_type\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The powert type of the bike station\"
},
{
\"name\": \"property_type\",
\"type\": \"STRING\",
\"mode\": \"NULLABLE\",
\"description\": \"The type of the property\"
},
{
\"name\": \"number_of_docks\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The number of docks the property have\"
},
{
\"name\": \"footprint_length\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The footpring lenght of the property\"
},
{
\"name\": \"council_district\",
\"type\": \"INTEGER\",
\"mode\": \"NULLABLE\",
\"description\": \"The council district the property is in\"
}
]
""")
.build());
var documentation = new Datascan("documentation", DatascanArgs.builder()
.location("us-central1")
.dataScanId("datadocumentation")
.data(DatascanDataArgs.builder()
.resource(Output.tuple(tfDataplexTestDataset.datasetId(), tfDataplexTestTable.tableId()).applyValue(values -> {
var datasetId = values.t1;
var tableId = values.t2;
return String.format("//bigquery.googleapis.com/projects/my-project-name/datasets/%s/tables/%s", datasetId,tableId);
}))
.build())
.executionSpec(DatascanExecutionSpecArgs.builder()
.trigger(DatascanExecutionSpecTriggerArgs.builder()
.onDemand(DatascanExecutionSpecTriggerOnDemandArgs.builder()
.build())
.build())
.build())
.dataDocumentationSpec(DatascanDataDocumentationSpecArgs.builder()
.build())
.project("my-project-name")
.build());
}
}
resources:
tfDataplexTestDataset:
type: gcp:bigquery:Dataset
name: tf_dataplex_test_dataset
properties:
datasetId: tf_dataplex_test_dataset_id__64336
defaultTableExpirationMs: 3600000
tfDataplexTestTable:
type: gcp:bigquery:Table
name: tf_dataplex_test_table
properties:
datasetId: ${tfDataplexTestDataset.datasetId}
tableId: tf_dataplex_test_table_id__34962
deletionProtection: false
schema: |2
 [
{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "station_id",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The id of the bike station"
},
{
"name": "address",
"type": "STRING",
"mode": "NULLABLE",
"description": "The address of the bike station"
},
{
"name": "power_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The power type of the bike station"
},
{
"name": "property_type",
"type": "STRING",
"mode": "NULLABLE",
"description": "The type of the property"
},
{
"name": "number_of_docks",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The number of docks the property has"
},
{
"name": "footprint_length",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The footprint length of the property"
},
{
"name": "council_district",
"type": "INTEGER",
"mode": "NULLABLE",
"description": "The council district the property is in"
}
]
documentation:
type: gcp:dataplex:Datascan
properties:
location: us-central1
dataScanId: datadocumentation
data:
resource: //bigquery.googleapis.com/projects/my-project-name/datasets/${tfDataplexTestDataset.datasetId}/tables/${tfDataplexTestTable.tableId}
executionSpec:
trigger:
onDemand: {}
dataDocumentationSpec: {}
project: my-project-name
An empty dataDocumentationSpec enables documentation mode. Documentation scans extract metadata from BigQuery tables, including column names, types, and descriptions, and this metadata can be published to Data Catalog for centralized discovery.
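The same documentation scan can run on a schedule rather than on demand. A sketch pointing at an existing table, where the dataset and table in the resource path are placeholders:
import * as gcp from "@pulumi/gcp";
// Scheduled documentation scan against an existing table.
// The resource path components below are placeholders for illustration.
const nightlyDocs = new gcp.dataplex.Datascan("nightly_docs", {
location: "us-central1",
dataScanId: "datadocumentation-nightly",
data: {
resource: "//bigquery.googleapis.com/projects/my-project-name/datasets/my_dataset/tables/my_table",
},
executionSpec: {
trigger: {
schedule: {
cron: "TZ=America/New_York 0 2 * * *", // daily at 2:00 AM
},
},
},
dataDocumentationSpec: {},
project: "my-project-name",
});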
Beyond these examples
These snippets focus on specific datascan features: data profiling and quality validation, schema discovery and BigQuery publishing, and scheduled and on-demand execution. They’re intentionally minimal rather than full data governance solutions.
The examples may reference pre-existing infrastructure such as BigQuery datasets and tables, Cloud Storage buckets, and BigQuery connections for BigLake. They focus on configuring the datascan rather than provisioning data sources.
To keep things focused, common datascan patterns are omitted, including:
- IAM permissions for data access and scan execution
- Incremental scanning (executionSpec.field for timestamp-based filtering; sketched briefly below)
- Custom sampling strategies beyond percentage
- Integration with Data Catalog for metadata management
These omissions are intentional: the goal is to illustrate how each datascan feature is wired, not provide drop-in data quality modules. See the Dataplex Datascan resource reference for all available configuration options.
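One of these, incremental scanning, is compact enough to sketch here: executionSpec.field names a Date or Timestamp column whose values increase monotonically, so each scheduled run scans only newly arrived data. A minimal sketch, where the event_ts column and the resource path are hypothetical:
import * as gcp from "@pulumi/gcp";
// Incremental profiling sketch: executionSpec.field points at a
// monotonically increasing Date or Timestamp column, so each run
// scans only data added since the previous run.
// "event_ts" and the resource path are hypothetical placeholders.
const incrementalProfile = new gcp.dataplex.Datascan("incremental_profile", {
location: "us-central1",
dataScanId: "dataprofile-incremental",
data: {
resource: "//bigquery.googleapis.com/projects/my-project-name/datasets/my_dataset/tables/my_table",
},
executionSpec: {
field: "event_ts",
trigger: {
schedule: {
cron: "TZ=America/New_York 0 3 * * *",
},
},
},
dataProfileSpec: {},
project: "my-project-name",
});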
Let's configure GCP Dataplex Data Scans
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Frequently Asked Questions
Configuration & Setup
- The immutable properties are data (data source), dataScanId (identifier), location, and project. Plan these carefully during creation, as changing them requires recreating the resource.
- The scan type is determined by which spec block you set: dataProfileSpec (data profiling), dataQualitySpec (quality validation), dataDiscoverySpec (schema discovery), or dataDocumentationSpec (documentation generation).
- Reference BigQuery tables as //bigquery.googleapis.com/projects/{project}/datasets/{dataset}/tables/{table} in the data.resource field.
- Reference Cloud Storage buckets as //storage.googleapis.com/projects/{project}/buckets/{bucket} in the data.resource field.
Scheduling & Execution
- onDemand triggers require manual execution (configured as an empty object {}), while schedule triggers run automatically based on a cron expression.
- Set schedules via executionSpec.trigger.schedule.cron with a cron expression, such as TZ=America/New_York 1 1 * * * for daily execution at 1:01 AM.
Data Quality & Profiling
- The available rule types are nonNullExpectation, rangeExpectation, regexExpectation, setExpectation, uniquenessExpectation, statisticRangeExpectation, rowConditionExpectation, tableConditionExpectation, and sqlAssertion.
- Use rowFilter with a SQL expression (e.g., word_count > 10 in dataProfileSpec or station_id > 1000 in dataQualitySpec) to filter rows before scanning.
- Use includeFields.fieldNames to specify columns to include, or excludeFields.fieldNames to specify columns to exclude in dataProfileSpec.
- Set samplingPercent in dataProfileSpec or dataQualitySpec (e.g., 80 for 80% sampling, 5 for 5% sampling).
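A sketch tying these quality properties together, assuming a hypothetical orders table with order_id and amount columns:
import * as gcp from "@pulumi/gcp";
// Quality scan sketch combining sampling, row filtering, and two rule
// types. The table, columns, and thresholds are hypothetical.
const qualityChecks = new gcp.dataplex.Datascan("quality_checks", {
location: "us-central1",
dataScanId: "dataquality-orders",
data: {
resource: "//bigquery.googleapis.com/projects/my-project-name/datasets/my_dataset/tables/orders",
},
executionSpec: {
trigger: {
onDemand: {},
},
},
dataQualitySpec: {
samplingPercent: 20,
rowFilter: "amount > 0",
rules: [
{
column: "order_id",
dimension: "COMPLETENESS",
nonNullExpectation: {},
},
{
column: "amount",
dimension: "VALIDITY",
rangeExpectation: {
minValue: "0",
maxValue: "100000",
},
},
],
},
project: "my-project-name",
});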
Export & Publishing
- Export results to BigQuery by setting dataProfileSpec.postScanActions.bigqueryExport.resultsTable with a BigQuery table reference in the format //bigquery.googleapis.com/projects/{project}/datasets/{dataset}/tables/{table}.
- Publish results to the catalog by setting catalogPublishingEnabled to true in dataProfileSpec or dataQualitySpec.
- Send email notifications by setting dataQualitySpec.postScanActions.notificationReport.recipients.emails with email addresses, and configure scoreThresholdTrigger.scoreThreshold to trigger notifications when quality scores fall below the threshold.
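A sketch of that post-scan wiring as it would slot into dataQualitySpec.postScanActions; the results table, recipient address, and threshold are hypothetical:
// Fragment for dataQualitySpec.postScanActions (not a full resource):
// export results to BigQuery and email the team when the quality score
// drops below 80. All identifiers below are hypothetical.
const postScanActions = {
bigqueryExport: {
resultsTable: "//bigquery.googleapis.com/projects/my-project-name/datasets/my_dataset/tables/dq_results",
},
notificationReport: {
recipients: {
emails: ["data-team@example.com"],
},
scoreThresholdTrigger: {
scoreThreshold: 80,
},
},
};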
Labels & Metadata
- The labels field is non-authoritative and only manages labels defined in your configuration. Use the effectiveLabels output property to see all labels on the resource, including those set by other clients or services.