The gcp:dataproc/metastoreService:MetastoreService resource, part of the Pulumi GCP provider, provisions a managed Hive metastore service that stores and serves metadata for data lakes and analytics workloads. This guide focuses on four capabilities: Hive version and tier configuration, customer-managed encryption, VPC networking and scheduled backups, and Spanner backend with autoscaling.
Metastores integrate with Dataproc clusters and other analytics tools, and may reference Cloud KMS keys, VPC subnets, and Cloud Storage buckets. The examples are intentionally small. Combine them with your own networking, encryption, and backup infrastructure.
Create a basic metastore with maintenance windows
Most deployments start with a minimal configuration that specifies the Hive version, service tier, and maintenance schedule.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER tier,
// listening on port 9080, with a weekly maintenance window.
const _default = new gcp.dataproc.MetastoreService("default", {
    serviceId: "metastore-srv",
    location: "us-central1",
    port: 9080,
    tier: "DEVELOPER",
    // Window during which the service may restart for updates (hour is UTC).
    maintenanceWindow: {
        hourOfDay: 2,
        dayOfWeek: "SUNDAY",
    },
    // Hive metastore version served by this instance.
    hiveMetastoreConfig: {
        version: "2.3.6",
    },
    labels: {
        env: "test",
    },
});
import pulumi
import pulumi_gcp as gcp

# Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER tier,
# listening on port 9080, with a weekly maintenance window.
default = gcp.dataproc.MetastoreService("default",
    service_id="metastore-srv",
    location="us-central1",
    port=9080,
    tier="DEVELOPER",
    # Window during which the service may restart for updates (hour is UTC).
    maintenance_window={
        "hour_of_day": 2,
        "day_of_week": "SUNDAY",
    },
    # Hive metastore version served by this instance.
    hive_metastore_config={
        "version": "2.3.6",
    },
    labels={
        "env": "test",
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER
		// tier, listening on port 9080, with a weekly maintenance window.
		_, err := dataproc.NewMetastoreService(ctx, "default", &dataproc.MetastoreServiceArgs{
			ServiceId: pulumi.String("metastore-srv"),
			Location:  pulumi.String("us-central1"),
			Port:      pulumi.Int(9080),
			Tier:      pulumi.String("DEVELOPER"),
			// Window during which the service may restart for updates (hour is UTC).
			MaintenanceWindow: &dataproc.MetastoreServiceMaintenanceWindowArgs{
				HourOfDay: pulumi.Int(2),
				DayOfWeek: pulumi.String("SUNDAY"),
			},
			// Hive metastore version served by this instance.
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("2.3.6"),
			},
			Labels: pulumi.StringMap{
				"env": pulumi.String("test"),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER tier,
    // listening on port 9080, with a weekly maintenance window.
    var @default = new Gcp.Dataproc.MetastoreService("default", new()
    {
        ServiceId = "metastore-srv",
        Location = "us-central1",
        Port = 9080,
        Tier = "DEVELOPER",
        // Window during which the service may restart for updates (hour is UTC).
        MaintenanceWindow = new Gcp.Dataproc.Inputs.MetastoreServiceMaintenanceWindowArgs
        {
            HourOfDay = 2,
            DayOfWeek = "SUNDAY",
        },
        // Hive metastore version served by this instance.
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "2.3.6",
        },
        Labels =
        {
            { "env", "test" },
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceMaintenanceWindowArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    // Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER tier,
    // listening on port 9080, with a weekly maintenance window.
    public static void stack(Context ctx) {
        var default_ = new MetastoreService("default", MetastoreServiceArgs.builder()
            .serviceId("metastore-srv")
            .location("us-central1")
            .port(9080)
            .tier("DEVELOPER")
            // Window during which the service may restart for updates (hour is UTC).
            .maintenanceWindow(MetastoreServiceMaintenanceWindowArgs.builder()
                .hourOfDay(2)
                .dayOfWeek("SUNDAY")
                .build())
            // Hive metastore version served by this instance.
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("2.3.6")
                .build())
            .labels(Map.of("env", "test"))
            .build());
    }
}
resources:
  # Minimal Dataproc Metastore service: Hive 2.3.6 on the DEVELOPER tier,
  # listening on port 9080, with a weekly maintenance window.
  default:
    type: gcp:dataproc:MetastoreService
    properties:
      serviceId: metastore-srv
      location: us-central1
      port: 9080
      tier: DEVELOPER
      # Window during which the service may restart for updates (hour is UTC).
      maintenanceWindow:
        hourOfDay: 2
        dayOfWeek: SUNDAY
      # Hive metastore version served by this instance.
      hiveMetastoreConfig:
        version: 2.3.6
      labels:
        env: test
The hiveMetastoreConfig sets the Hive version (2.3.6 in this case). The tier property controls capacity and features; DEVELOPER is suitable for testing, while ENTERPRISE provides higher availability. The maintenanceWindow defines when the service can restart for updates, specified as a day of week and hour in UTC.
Encrypt metadata with customer-managed keys
Organizations with compliance requirements encrypt metastore data using customer-managed encryption keys from Cloud KMS.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Cloud KMS key ring and key used to encrypt metastore metadata at rest.
const keyRing = new gcp.kms.KeyRing("key_ring", {
    name: "example-keyring",
    location: "us-central1",
});
const cryptoKey = new gcp.kms.CryptoKey("crypto_key", {
    name: "example-key",
    keyRing: keyRing.id,
    purpose: "ENCRYPT_DECRYPT",
});

// Metastore service configured with a customer-managed encryption key (CMEK).
const _default = new gcp.dataproc.MetastoreService("default", {
    serviceId: "example-service",
    location: "us-central1",
    // All metadata is encrypted at rest with this key.
    encryptionConfig: {
        kmsKey: cryptoKey.id,
    },
    hiveMetastoreConfig: {
        version: "3.1.2",
    },
});
import pulumi
import pulumi_gcp as gcp

# Cloud KMS key ring and key used to encrypt metastore metadata at rest.
key_ring = gcp.kms.KeyRing("key_ring",
    name="example-keyring",
    location="us-central1")
crypto_key = gcp.kms.CryptoKey("crypto_key",
    name="example-key",
    key_ring=key_ring.id,
    purpose="ENCRYPT_DECRYPT")

# Metastore service configured with a customer-managed encryption key (CMEK).
default = gcp.dataproc.MetastoreService("default",
    service_id="example-service",
    location="us-central1",
    # All metadata is encrypted at rest with this key.
    encryption_config={
        "kms_key": crypto_key.id,
    },
    hive_metastore_config={
        "version": "3.1.2",
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/kms"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Cloud KMS key ring and key used to encrypt metastore metadata at rest.
		keyRing, err := kms.NewKeyRing(ctx, "key_ring", &kms.KeyRingArgs{
			Name:     pulumi.String("example-keyring"),
			Location: pulumi.String("us-central1"),
		})
		if err != nil {
			return err
		}
		cryptoKey, err := kms.NewCryptoKey(ctx, "crypto_key", &kms.CryptoKeyArgs{
			Name:    pulumi.String("example-key"),
			KeyRing: keyRing.ID(),
			Purpose: pulumi.String("ENCRYPT_DECRYPT"),
		})
		if err != nil {
			return err
		}
		// Metastore service configured with a customer-managed encryption key (CMEK).
		_, err = dataproc.NewMetastoreService(ctx, "default", &dataproc.MetastoreServiceArgs{
			ServiceId: pulumi.String("example-service"),
			Location:  pulumi.String("us-central1"),
			// All metadata is encrypted at rest with this key.
			EncryptionConfig: &dataproc.MetastoreServiceEncryptionConfigArgs{
				KmsKey: cryptoKey.ID(),
			},
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("3.1.2"),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Cloud KMS key ring and key used to encrypt metastore metadata at rest.
    var keyRing = new Gcp.Kms.KeyRing("key_ring", new()
    {
        Name = "example-keyring",
        Location = "us-central1",
    });
    var cryptoKey = new Gcp.Kms.CryptoKey("crypto_key", new()
    {
        Name = "example-key",
        KeyRing = keyRing.Id,
        Purpose = "ENCRYPT_DECRYPT",
    });

    // Metastore service configured with a customer-managed encryption key (CMEK).
    var @default = new Gcp.Dataproc.MetastoreService("default", new()
    {
        ServiceId = "example-service",
        Location = "us-central1",
        // All metadata is encrypted at rest with this key.
        EncryptionConfig = new Gcp.Dataproc.Inputs.MetastoreServiceEncryptionConfigArgs
        {
            KmsKey = cryptoKey.Id,
        },
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "3.1.2",
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.kms.KeyRing;
import com.pulumi.gcp.kms.KeyRingArgs;
import com.pulumi.gcp.kms.CryptoKey;
import com.pulumi.gcp.kms.CryptoKeyArgs;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceEncryptionConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Cloud KMS key ring and key used to encrypt metastore metadata at rest.
        var keyRing = new KeyRing("keyRing", KeyRingArgs.builder()
            .name("example-keyring")
            .location("us-central1")
            .build());
        var cryptoKey = new CryptoKey("cryptoKey", CryptoKeyArgs.builder()
            .name("example-key")
            .keyRing(keyRing.id())
            .purpose("ENCRYPT_DECRYPT")
            .build());

        // Metastore service configured with a customer-managed encryption key (CMEK).
        var default_ = new MetastoreService("default", MetastoreServiceArgs.builder()
            .serviceId("example-service")
            .location("us-central1")
            // All metadata is encrypted at rest with this key.
            .encryptionConfig(MetastoreServiceEncryptionConfigArgs.builder()
                .kmsKey(cryptoKey.id())
                .build())
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("3.1.2")
                .build())
            .build());
    }
}
resources:
  # Metastore service configured with a customer-managed encryption key (CMEK).
  default:
    type: gcp:dataproc:MetastoreService
    properties:
      serviceId: example-service
      location: us-central1
      # All metadata is encrypted at rest with this key.
      encryptionConfig:
        kmsKey: ${cryptoKey.id}
      hiveMetastoreConfig:
        version: 3.1.2
  # Cloud KMS key used for encryption; lives in the key ring below.
  cryptoKey:
    type: gcp:kms:CryptoKey
    name: crypto_key
    properties:
      name: example-key
      keyRing: ${keyRing.id}
      purpose: ENCRYPT_DECRYPT
  keyRing:
    type: gcp:kms:KeyRing
    name: key_ring
    properties:
      name: example-keyring
      location: us-central1
The encryptionConfig property points to a Cloud KMS CryptoKey. The metastore service encrypts all metadata at rest using this key. You must grant the metastore service account permission to use the key for encryption and decryption operations.
Connect to a private VPC subnet
Production metastores typically run in private networks to restrict access and integrate with VPC resources like Dataproc clusters.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Custom-mode VPC and a subnet with Private Google Access enabled.
const net = new gcp.compute.Network("net", {
    name: "my-network",
    autoCreateSubnetworks: false,
});
const subnet = new gcp.compute.Subnetwork("subnet", {
    name: "my-subnetwork",
    region: "us-central1",
    network: net.id,
    ipCidrRange: "10.0.0.0/22",
    privateIpGoogleAccess: true,
});

// Metastore service reachable from the subnet listed as a consumer.
const _default = new gcp.dataproc.MetastoreService("default", {
    serviceId: "metastore-srv",
    location: "us-central1",
    tier: "DEVELOPER",
    hiveMetastoreConfig: {
        version: "3.1.2",
    },
    // Each consumer entry grants a subnetwork access to the service.
    networkConfig: {
        consumers: [{
            subnetwork: subnet.id,
        }],
    },
});
import pulumi
import pulumi_gcp as gcp

# Custom-mode VPC and a subnet with Private Google Access enabled.
net = gcp.compute.Network("net",
    name="my-network",
    auto_create_subnetworks=False)
subnet = gcp.compute.Subnetwork("subnet",
    name="my-subnetwork",
    region="us-central1",
    network=net.id,
    ip_cidr_range="10.0.0.0/22",
    private_ip_google_access=True)

# Metastore service reachable from the subnet listed as a consumer.
default = gcp.dataproc.MetastoreService("default",
    service_id="metastore-srv",
    location="us-central1",
    tier="DEVELOPER",
    hive_metastore_config={
        "version": "3.1.2",
    },
    # Each consumer entry grants a subnetwork access to the service.
    network_config={
        "consumers": [{
            "subnetwork": subnet.id,
        }],
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/compute"
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Custom-mode VPC and a subnet with Private Google Access enabled.
		net, err := compute.NewNetwork(ctx, "net", &compute.NetworkArgs{
			Name:                  pulumi.String("my-network"),
			AutoCreateSubnetworks: pulumi.Bool(false),
		})
		if err != nil {
			return err
		}
		subnet, err := compute.NewSubnetwork(ctx, "subnet", &compute.SubnetworkArgs{
			Name:                  pulumi.String("my-subnetwork"),
			Region:                pulumi.String("us-central1"),
			Network:               net.ID(),
			IpCidrRange:           pulumi.String("10.0.0.0/22"),
			PrivateIpGoogleAccess: pulumi.Bool(true),
		})
		if err != nil {
			return err
		}
		// Metastore service reachable from the subnet listed as a consumer.
		_, err = dataproc.NewMetastoreService(ctx, "default", &dataproc.MetastoreServiceArgs{
			ServiceId: pulumi.String("metastore-srv"),
			Location:  pulumi.String("us-central1"),
			Tier:      pulumi.String("DEVELOPER"),
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("3.1.2"),
			},
			// Each consumer entry grants a subnetwork access to the service.
			NetworkConfig: &dataproc.MetastoreServiceNetworkConfigArgs{
				Consumers: dataproc.MetastoreServiceNetworkConfigConsumerArray{
					&dataproc.MetastoreServiceNetworkConfigConsumerArgs{
						Subnetwork: subnet.ID(),
					},
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Custom-mode VPC and a subnet with Private Google Access enabled.
    var net = new Gcp.Compute.Network("net", new()
    {
        Name = "my-network",
        AutoCreateSubnetworks = false,
    });
    var subnet = new Gcp.Compute.Subnetwork("subnet", new()
    {
        Name = "my-subnetwork",
        Region = "us-central1",
        Network = net.Id,
        IpCidrRange = "10.0.0.0/22",
        PrivateIpGoogleAccess = true,
    });

    // Metastore service reachable from the subnet listed as a consumer.
    var @default = new Gcp.Dataproc.MetastoreService("default", new()
    {
        ServiceId = "metastore-srv",
        Location = "us-central1",
        Tier = "DEVELOPER",
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "3.1.2",
        },
        // Each consumer entry grants a subnetwork access to the service.
        NetworkConfig = new Gcp.Dataproc.Inputs.MetastoreServiceNetworkConfigArgs
        {
            Consumers = new[]
            {
                new Gcp.Dataproc.Inputs.MetastoreServiceNetworkConfigConsumerArgs
                {
                    Subnetwork = subnet.Id,
                },
            },
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.compute.Network;
import com.pulumi.gcp.compute.NetworkArgs;
import com.pulumi.gcp.compute.Subnetwork;
import com.pulumi.gcp.compute.SubnetworkArgs;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceNetworkConfigArgs;
// FIX: this import was missing, but the class is used below in the
// networkConfig consumer builder; without it the program does not compile.
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceNetworkConfigConsumerArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Custom-mode VPC and a subnet with Private Google Access enabled.
        var net = new Network("net", NetworkArgs.builder()
            .name("my-network")
            .autoCreateSubnetworks(false)
            .build());
        var subnet = new Subnetwork("subnet", SubnetworkArgs.builder()
            .name("my-subnetwork")
            .region("us-central1")
            .network(net.id())
            .ipCidrRange("10.0.0.0/22")
            .privateIpGoogleAccess(true)
            .build());

        // Metastore service reachable from the subnet listed as a consumer.
        var default_ = new MetastoreService("default", MetastoreServiceArgs.builder()
            .serviceId("metastore-srv")
            .location("us-central1")
            .tier("DEVELOPER")
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("3.1.2")
                .build())
            // Each consumer entry grants a subnetwork access to the service.
            .networkConfig(MetastoreServiceNetworkConfigArgs.builder()
                .consumers(MetastoreServiceNetworkConfigConsumerArgs.builder()
                    .subnetwork(subnet.id())
                    .build())
                .build())
            .build());
    }
}
resources:
  # Custom-mode VPC and a subnet with Private Google Access enabled.
  net:
    type: gcp:compute:Network
    properties:
      name: my-network
      autoCreateSubnetworks: false
  subnet:
    type: gcp:compute:Subnetwork
    properties:
      name: my-subnetwork
      region: us-central1
      network: ${net.id}
      ipCidrRange: 10.0.0.0/22
      privateIpGoogleAccess: true
  # Metastore service reachable from the subnet listed as a consumer.
  default:
    type: gcp:dataproc:MetastoreService
    properties:
      serviceId: metastore-srv
      location: us-central1
      tier: DEVELOPER
      hiveMetastoreConfig:
        version: 3.1.2
      # Each consumer entry grants a subnetwork access to the service.
      networkConfig:
        consumers:
          - subnetwork: ${subnet.id}
The networkConfig property places the metastore in a specified subnet. The consumers array lists subnetworks that can access the service. This example creates a VPC and subnet inline; in practice, you’d reference existing network infrastructure.
Schedule automated backups to Cloud Storage
Teams running production metastores configure automated backups for disaster recovery and point-in-time restoration.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Cloud Storage bucket that receives the scheduled backups.
const bucket = new gcp.storage.Bucket("bucket", {
    name: "backup",
    location: "us-central1",
});

// Metastore service with daily scheduled backups written to the bucket.
const backup = new gcp.dataproc.MetastoreService("backup", {
    serviceId: "backup",
    location: "us-central1",
    port: 9080,
    tier: "DEVELOPER",
    maintenanceWindow: {
        hourOfDay: 2,
        dayOfWeek: "SUNDAY",
    },
    hiveMetastoreConfig: {
        version: "2.3.6",
    },
    // Backup runs daily at midnight UTC into gs://<bucket>.
    scheduledBackup: {
        enabled: true,
        cronSchedule: "0 0 * * *",
        timeZone: "UTC",
        backupLocation: pulumi.interpolate`gs://${bucket.name}`,
    },
    labels: {
        env: "test",
    },
});
import pulumi
import pulumi_gcp as gcp

# Cloud Storage bucket that receives the scheduled backups.
bucket = gcp.storage.Bucket("bucket",
    name="backup",
    location="us-central1")

# Metastore service with daily scheduled backups written to the bucket.
backup = gcp.dataproc.MetastoreService("backup",
    service_id="backup",
    location="us-central1",
    port=9080,
    tier="DEVELOPER",
    maintenance_window={
        "hour_of_day": 2,
        "day_of_week": "SUNDAY",
    },
    hive_metastore_config={
        "version": "2.3.6",
    },
    # Backup runs daily at midnight UTC into gs://<bucket>.
    scheduled_backup={
        "enabled": True,
        "cron_schedule": "0 0 * * *",
        "time_zone": "UTC",
        "backup_location": bucket.name.apply(lambda name: f"gs://{name}"),
    },
    labels={
        "env": "test",
    })
package main

import (
	"fmt"
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/storage"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Cloud Storage bucket that receives the scheduled backups.
		bucket, err := storage.NewBucket(ctx, "bucket", &storage.BucketArgs{
			Name:     pulumi.String("backup"),
			Location: pulumi.String("us-central1"),
		})
		if err != nil {
			return err
		}
		// Metastore service with daily scheduled backups written to the bucket.
		_, err = dataproc.NewMetastoreService(ctx, "backup", &dataproc.MetastoreServiceArgs{
			ServiceId: pulumi.String("backup"),
			Location:  pulumi.String("us-central1"),
			Port:      pulumi.Int(9080),
			Tier:      pulumi.String("DEVELOPER"),
			MaintenanceWindow: &dataproc.MetastoreServiceMaintenanceWindowArgs{
				HourOfDay: pulumi.Int(2),
				DayOfWeek: pulumi.String("SUNDAY"),
			},
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("2.3.6"),
			},
			// Backup runs daily at midnight UTC into gs://<bucket>.
			ScheduledBackup: &dataproc.MetastoreServiceScheduledBackupArgs{
				Enabled:      pulumi.Bool(true),
				CronSchedule: pulumi.String("0 0 * * *"),
				TimeZone:     pulumi.String("UTC"),
				BackupLocation: bucket.Name.ApplyT(func(name string) (string, error) {
					return fmt.Sprintf("gs://%v", name), nil
				}).(pulumi.StringOutput),
			},
			Labels: pulumi.StringMap{
				"env": pulumi.String("test"),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Cloud Storage bucket that receives the scheduled backups.
    var bucket = new Gcp.Storage.Bucket("bucket", new()
    {
        Name = "backup",
        Location = "us-central1",
    });

    // Metastore service with daily scheduled backups written to the bucket.
    var backup = new Gcp.Dataproc.MetastoreService("backup", new()
    {
        ServiceId = "backup",
        Location = "us-central1",
        Port = 9080,
        Tier = "DEVELOPER",
        MaintenanceWindow = new Gcp.Dataproc.Inputs.MetastoreServiceMaintenanceWindowArgs
        {
            HourOfDay = 2,
            DayOfWeek = "SUNDAY",
        },
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "2.3.6",
        },
        // Backup runs daily at midnight UTC into gs://<bucket>.
        ScheduledBackup = new Gcp.Dataproc.Inputs.MetastoreServiceScheduledBackupArgs
        {
            Enabled = true,
            CronSchedule = "0 0 * * *",
            TimeZone = "UTC",
            BackupLocation = bucket.Name.Apply(name => $"gs://{name}"),
        },
        Labels =
        {
            { "env", "test" },
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.storage.Bucket;
import com.pulumi.gcp.storage.BucketArgs;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceMaintenanceWindowArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceScheduledBackupArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    public static void stack(Context ctx) {
        // Cloud Storage bucket that receives the scheduled backups.
        var bucket = new Bucket("bucket", BucketArgs.builder()
            .name("backup")
            .location("us-central1")
            .build());

        // Metastore service with daily scheduled backups written to the bucket.
        var backup = new MetastoreService("backup", MetastoreServiceArgs.builder()
            .serviceId("backup")
            .location("us-central1")
            .port(9080)
            .tier("DEVELOPER")
            .maintenanceWindow(MetastoreServiceMaintenanceWindowArgs.builder()
                .hourOfDay(2)
                .dayOfWeek("SUNDAY")
                .build())
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("2.3.6")
                .build())
            // Backup runs daily at midnight UTC into gs://<bucket>.
            .scheduledBackup(MetastoreServiceScheduledBackupArgs.builder()
                .enabled(true)
                .cronSchedule("0 0 * * *")
                .timeZone("UTC")
                .backupLocation(bucket.name().applyValue(_name -> String.format("gs://%s", _name)))
                .build())
            .labels(Map.of("env", "test"))
            .build());
    }
}
resources:
  # Metastore service with daily scheduled backups written to the bucket below.
  backup:
    type: gcp:dataproc:MetastoreService
    properties:
      serviceId: backup
      location: us-central1
      port: 9080
      tier: DEVELOPER
      maintenanceWindow:
        hourOfDay: 2
        dayOfWeek: SUNDAY
      hiveMetastoreConfig:
        version: 2.3.6
      # Backup runs daily at midnight UTC into gs://<bucket>.
      scheduledBackup:
        enabled: true
        cronSchedule: 0 0 * * *
        timeZone: UTC
        backupLocation: gs://${bucket.name}
      labels:
        env: test
  # Cloud Storage bucket that receives the scheduled backups.
  bucket:
    type: gcp:storage:Bucket
    properties:
      name: backup
      location: us-central1
The scheduledBackup property enables daily backups using a cron schedule. The backupLocation points to a Cloud Storage bucket where backup data is written. The metastore service account needs write permissions to the bucket. Backups capture metadata state for restoration if needed.
Use Spanner as the metadata backend
For higher availability and scalability, metastores can use Cloud Spanner instead of the default MySQL backend.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Metastore service using Cloud Spanner (instead of MySQL) as the backend,
// sized with a fixed EXTRA_SMALL instance.
const dpms2 = new gcp.dataproc.MetastoreService("dpms2", {
    serviceId: "ms-dpms2",
    location: "us-central1",
    databaseType: "SPANNER",
    hiveMetastoreConfig: {
        version: "3.1.2",
    },
    // Fixed capacity for the Spanner backend.
    scalingConfig: {
        instanceSize: "EXTRA_SMALL",
    },
});
import pulumi
import pulumi_gcp as gcp

# Metastore service using Cloud Spanner (instead of MySQL) as the backend,
# sized with a fixed EXTRA_SMALL instance.
dpms2 = gcp.dataproc.MetastoreService("dpms2",
    service_id="ms-dpms2",
    location="us-central1",
    database_type="SPANNER",
    hive_metastore_config={
        "version": "3.1.2",
    },
    # Fixed capacity for the Spanner backend.
    scaling_config={
        "instance_size": "EXTRA_SMALL",
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Metastore service using Cloud Spanner (instead of MySQL) as the
		// backend, sized with a fixed EXTRA_SMALL instance.
		_, err := dataproc.NewMetastoreService(ctx, "dpms2", &dataproc.MetastoreServiceArgs{
			ServiceId:    pulumi.String("ms-dpms2"),
			Location:     pulumi.String("us-central1"),
			DatabaseType: pulumi.String("SPANNER"),
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("3.1.2"),
			},
			// Fixed capacity for the Spanner backend.
			ScalingConfig: &dataproc.MetastoreServiceScalingConfigArgs{
				InstanceSize: pulumi.String("EXTRA_SMALL"),
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Metastore service using Cloud Spanner (instead of MySQL) as the backend,
    // sized with a fixed EXTRA_SMALL instance.
    var dpms2 = new Gcp.Dataproc.MetastoreService("dpms2", new()
    {
        ServiceId = "ms-dpms2",
        Location = "us-central1",
        DatabaseType = "SPANNER",
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "3.1.2",
        },
        // Fixed capacity for the Spanner backend.
        ScalingConfig = new Gcp.Dataproc.Inputs.MetastoreServiceScalingConfigArgs
        {
            InstanceSize = "EXTRA_SMALL",
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceScalingConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    // Metastore service using Cloud Spanner (instead of MySQL) as the backend,
    // sized with a fixed EXTRA_SMALL instance.
    public static void stack(Context ctx) {
        var dpms2 = new MetastoreService("dpms2", MetastoreServiceArgs.builder()
            .serviceId("ms-dpms2")
            .location("us-central1")
            .databaseType("SPANNER")
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("3.1.2")
                .build())
            // Fixed capacity for the Spanner backend.
            .scalingConfig(MetastoreServiceScalingConfigArgs.builder()
                .instanceSize("EXTRA_SMALL")
                .build())
            .build());
    }
}
resources:
  # Metastore service using Cloud Spanner (instead of MySQL) as the backend,
  # sized with a fixed EXTRA_SMALL instance.
  dpms2:
    type: gcp:dataproc:MetastoreService
    properties:
      serviceId: ms-dpms2
      location: us-central1
      databaseType: SPANNER
      hiveMetastoreConfig:
        version: 3.1.2
      # Fixed capacity for the Spanner backend.
      scalingConfig:
        instanceSize: EXTRA_SMALL
The databaseType property switches from MySQL to Spanner. Spanner-backed metastores use scalingConfig to define capacity; instanceSize sets the initial size. Spanner provides better scalability and eliminates the need for maintenance windows.
Configure autoscaling with capacity bounds
Spanner-backed metastores support autoscaling to adjust capacity based on load, with configurable minimum and maximum scaling factors.
import * as pulumi from "@pulumi/pulumi";
import * as gcp from "@pulumi/gcp";

// Spanner-backed metastore with autoscaling bounded between 10% and 100%
// of base capacity.
const testResource = new gcp.dataproc.MetastoreService("test_resource", {
    serviceId: "test-service",
    location: "us-central1",
    databaseType: "SPANNER",
    hiveMetastoreConfig: {
        version: "3.1.2",
    },
    scalingConfig: {
        autoscalingConfig: {
            autoscalingEnabled: true,
            // Lower/upper bounds for automatic capacity adjustment.
            limitConfig: {
                minScalingFactor: 0.1,
                maxScalingFactor: 1,
            },
        },
    },
});
import pulumi
import pulumi_gcp as gcp

# Spanner-backed metastore with autoscaling bounded between 10% and 100%
# of base capacity.
test_resource = gcp.dataproc.MetastoreService("test_resource",
    service_id="test-service",
    location="us-central1",
    database_type="SPANNER",
    hive_metastore_config={
        "version": "3.1.2",
    },
    scaling_config={
        "autoscaling_config": {
            "autoscaling_enabled": True,
            # Lower/upper bounds for automatic capacity adjustment.
            "limit_config": {
                "min_scaling_factor": 0.1,
                "max_scaling_factor": 1,
            },
        },
    })
package main

import (
	"github.com/pulumi/pulumi-gcp/sdk/v9/go/gcp/dataproc"
	"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)

func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Spanner-backed metastore with autoscaling bounded between 10% and
		// 100% of base capacity.
		_, err := dataproc.NewMetastoreService(ctx, "test_resource", &dataproc.MetastoreServiceArgs{
			ServiceId:    pulumi.String("test-service"),
			Location:     pulumi.String("us-central1"),
			DatabaseType: pulumi.String("SPANNER"),
			HiveMetastoreConfig: &dataproc.MetastoreServiceHiveMetastoreConfigArgs{
				Version: pulumi.String("3.1.2"),
			},
			ScalingConfig: &dataproc.MetastoreServiceScalingConfigArgs{
				AutoscalingConfig: &dataproc.MetastoreServiceScalingConfigAutoscalingConfigArgs{
					AutoscalingEnabled: pulumi.Bool(true),
					// Lower/upper bounds for automatic capacity adjustment.
					LimitConfig: &dataproc.MetastoreServiceScalingConfigAutoscalingConfigLimitConfigArgs{
						MinScalingFactor: pulumi.Float64(0.1),
						MaxScalingFactor: pulumi.Float64(1),
					},
				},
			},
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Gcp = Pulumi.Gcp;

return await Deployment.RunAsync(() =>
{
    // Spanner-backed metastore with autoscaling bounded between 10% and 100%
    // of base capacity.
    var testResource = new Gcp.Dataproc.MetastoreService("test_resource", new()
    {
        ServiceId = "test-service",
        Location = "us-central1",
        DatabaseType = "SPANNER",
        HiveMetastoreConfig = new Gcp.Dataproc.Inputs.MetastoreServiceHiveMetastoreConfigArgs
        {
            Version = "3.1.2",
        },
        ScalingConfig = new Gcp.Dataproc.Inputs.MetastoreServiceScalingConfigArgs
        {
            AutoscalingConfig = new Gcp.Dataproc.Inputs.MetastoreServiceScalingConfigAutoscalingConfigArgs
            {
                AutoscalingEnabled = true,
                // Lower/upper bounds for automatic capacity adjustment.
                LimitConfig = new Gcp.Dataproc.Inputs.MetastoreServiceScalingConfigAutoscalingConfigLimitConfigArgs
                {
                    MinScalingFactor = 0.1,
                    MaxScalingFactor = 1,
                },
            },
        },
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.gcp.dataproc.MetastoreService;
import com.pulumi.gcp.dataproc.MetastoreServiceArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceHiveMetastoreConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceScalingConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceScalingConfigAutoscalingConfigArgs;
import com.pulumi.gcp.dataproc.inputs.MetastoreServiceScalingConfigAutoscalingConfigLimitConfigArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    // Spanner-backed metastore with autoscaling bounded between 10% and 100%
    // of base capacity.
    public static void stack(Context ctx) {
        var testResource = new MetastoreService("testResource", MetastoreServiceArgs.builder()
            .serviceId("test-service")
            .location("us-central1")
            .databaseType("SPANNER")
            .hiveMetastoreConfig(MetastoreServiceHiveMetastoreConfigArgs.builder()
                .version("3.1.2")
                .build())
            .scalingConfig(MetastoreServiceScalingConfigArgs.builder()
                .autoscalingConfig(MetastoreServiceScalingConfigAutoscalingConfigArgs.builder()
                    .autoscalingEnabled(true)
                    // Lower/upper bounds for automatic capacity adjustment.
                    .limitConfig(MetastoreServiceScalingConfigAutoscalingConfigLimitConfigArgs.builder()
                        .minScalingFactor(0.1)
                        .maxScalingFactor(1.0)
                        .build())
                    .build())
                .build())
            .build());
    }
}
resources:
  # Spanner-backed metastore with autoscaling bounded between 10% and 100%
  # of base capacity.
  testResource:
    type: gcp:dataproc:MetastoreService
    name: test_resource
    properties:
      serviceId: test-service
      location: us-central1
      databaseType: SPANNER
      hiveMetastoreConfig:
        version: 3.1.2
      scalingConfig:
        autoscalingConfig:
          autoscalingEnabled: true
          # Lower/upper bounds for automatic capacity adjustment.
          limitConfig:
            minScalingFactor: 0.1
            maxScalingFactor: 1
The autoscalingConfig enables automatic capacity adjustment. The limitConfig sets bounds: minScalingFactor defines the lowest capacity (0.1 = 10% of base), maxScalingFactor sets the ceiling (1.0 = 100% of base). Autoscaling only works with Spanner backends; MySQL-backed services use fixed capacity.
Beyond these examples
These snippets focus on specific metastore service features: Hive version and service tier selection, encryption and VPC networking, and Spanner backend with autoscaling. They’re intentionally minimal rather than full data platform deployments.
The examples may reference pre-existing infrastructure such as Cloud KMS keys for encryption, VPC networks and subnets when using Private Service Connect, and Cloud Storage buckets for backups. They focus on configuring the metastore service rather than provisioning everything around it.
To keep things focused, common metastore patterns are omitted, including:
- Metadata integration with external systems (metadataIntegration)
- Custom routes for Private Service Connect (customRoutesEnabled)
- Telemetry configuration (telemetryConfig)
- Release channel selection (releaseChannel)
- Deletion protection (deletionProtection)
These omissions are intentional: the goal is to illustrate how each metastore feature is wired, not provide drop-in data platform modules. See the Dataproc Metastore Service resource reference for all available configuration options.
Let's configure GCP Dataproc Metastore Services
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE.
Frequently Asked Questions
Configuration & Immutability
The properties that cannot be changed after creation include serviceId, project, location, network, networkConfig, databaseType, releaseChannel, and tags. The serviceId must be 3-63 characters long, contain only letters (a-z, A-Z), numbers (0-9), underscores (_), and hyphens (-), and cannot begin or end with an underscore or hyphen. The labels field is non-authoritative and only manages labels present in your configuration; use effectiveLabels to see all labels on the resource, including those set by other clients and services.
Database & Storage
A SPANNER-backed service does not require a maintenanceWindow configuration, while MYSQL does. The databaseType is immutable after creation. To use customer-managed encryption, set encryptionConfig with a kmsKey pointing to your Cloud KMS CryptoKey ID. To enable automated backups, configure scheduledBackup with enabled set to true, a cronSchedule (e.g., "0 0 * * *"), timeZone (e.g., "UTC"), and backupLocation pointing to a GCS bucket (e.g., gs://bucket-name).
Networking
To expose the service on a VPC subnet, set networkConfig with consumers containing the subnetwork ID. Optionally, set customRoutesEnabled to true for custom routing.
Scaling & Protection
Capacity can be configured with instanceSize (e.g., EXTRA_SMALL), scalingFactor (a numeric multiplier), or autoscalingConfig with limitConfig for dynamic scaling with min/max scaling factors. Set deletionProtection to true to prevent accidental deletions. The default port is 9083, and the default location is "global".
Using a different cloud?
Explore analytics guides for other cloud providers: