The aws:emr/cluster:Cluster resource, part of the Pulumi AWS provider, provisions an EMR cluster: its node configuration, applications, networking, and IAM roles. This guide focuses on four capabilities: instance groups and fleets, autoscaling policies, debug logging, and high availability with multiple master nodes.
EMR clusters require IAM service roles, instance profiles, VPC infrastructure (subnets, security groups), and S3 buckets for logs and bootstrap scripts. The examples are intentionally small: they reference resources such as main, sg, emrProfile, and iamEmrServiceRole that are not declared in the snippets themselves. Combine the examples with your own IAM roles, VPC configuration, and application-specific settings.
Launch a cluster with instance groups and autoscaling
Most deployments organize EC2 instances into master and core node groups, with autoscaling enabled to handle variable workloads.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// The JSON documents below are built with JSON.stringify instead of
// hand-escaped template literals: inside a template literal, `\\"` evaluates
// to a backslash followed by a quote, which is not valid JSON.

// Export block shared by the hadoop-env and spark-env classifications.
const javaHomeExport = {
    Classification: "export",
    Properties: {
        JAVA_HOME: "/usr/lib/jvm/java-1.8.0",
    },
};

// Autoscaling policy for the core instance group: capacity stays between 1 and
// 2, and one instance is added when the 300-second average of
// YARNMemoryAvailablePercentage is below 15 percent.
const coreAutoscalingPolicy = {
    Constraints: {
        MinCapacity: 1,
        MaxCapacity: 2,
    },
    Rules: [{
        Name: "ScaleOutMemoryPercentage",
        Description: "Scale out if YARNMemoryAvailablePercentage is less than 15",
        Action: {
            SimpleScalingPolicyConfiguration: {
                AdjustmentType: "CHANGE_IN_CAPACITY",
                ScalingAdjustment: 1,
                CoolDown: 300,
            },
        },
        Trigger: {
            CloudWatchAlarmDefinition: {
                ComparisonOperator: "LESS_THAN",
                EvaluationPeriods: 1,
                MetricName: "YARNMemoryAvailablePercentage",
                Namespace: "AWS/ElasticMapReduce",
                Period: 300,
                Statistic: "AVERAGE",
                Threshold: 15.0,
                Unit: "PERCENT",
            },
        },
    }],
};

// EMR cluster with one master group and one autoscaled core group.
// NOTE(review): main, sg, emrProfile and iamEmrServiceRole are not declared in
// this snippet; they are expected to be resources defined elsewhere.
const cluster = new aws.emr.Cluster("cluster", {
    name: "emr-test-arn",
    releaseLabel: "emr-4.6.0",
    applications: ["Spark"],
    additionalInfo: JSON.stringify({
        instanceAwsClientConfiguration: {
            proxyPort: 8099,
            proxyHost: "myproxy.example.com",
        },
    }),
    terminationProtection: false,
    keepJobFlowAliveWhenNoSteps: true,
    ec2Attributes: {
        subnetId: main.id,
        emrManagedMasterSecurityGroup: sg.id,
        emrManagedSlaveSecurityGroup: sg.id,
        instanceProfile: emrProfile.arn,
    },
    masterInstanceGroup: {
        instanceType: "m4.large",
    },
    coreInstanceGroup: {
        instanceType: "c4.large",
        instanceCount: 1,
        ebsConfigs: [{
            size: 40,
            type: "gp2",
            volumesPerInstance: 1,
        }],
        bidPrice: "0.30",
        autoscalingPolicy: JSON.stringify(coreAutoscalingPolicy),
    },
    ebsRootVolumeSize: 100,
    tags: {
        role: "rolename",
        env: "env",
    },
    bootstrapActions: [{
        path: "s3://elasticmapreduce/bootstrap-actions/run-if",
        name: "runif",
        args: [
            "instance.isMaster=true",
            "echo running on master node",
        ],
    }],
    configurationsJson: JSON.stringify([
        {
            Classification: "hadoop-env",
            Configurations: [javaHomeExport],
            Properties: {},
        },
        {
            Classification: "spark-env",
            Configurations: [javaHomeExport],
            Properties: {},
        },
    ]),
    serviceRole: iamEmrServiceRole.arn,
});
import pulumi
import pulumi_aws as aws
# JSON documents handed to the cluster. Inside a triple-quoted string a plain
# double quote needs no escaping, so they are written without backslashes.
additional_info_doc = """{
"instanceAwsClientConfiguration": {
"proxyPort": 8099,
"proxyHost": "myproxy.example.com"
}
}
"""

# Autoscaling policy for the core group: capacity between 1 and 2, adding one
# instance when average YARNMemoryAvailablePercentage is below 15 percent.
core_autoscaling_policy_doc = """{
"Constraints": {
"MinCapacity": 1,
"MaxCapacity": 2
},
"Rules": [
{
"Name": "ScaleOutMemoryPercentage",
"Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": 1,
"CoolDown": 300
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "LESS_THAN",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 300,
"Statistic": "AVERAGE",
"Threshold": 15.0,
"Unit": "PERCENT"
}
}
}
]
}
"""

# Sets JAVA_HOME for both the hadoop-env and spark-env classifications.
configurations_doc = """ [
{
"Classification": "hadoop-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
}
}
],
"Properties": {}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
}
}
],
"Properties": {}
}
]
"""

# EMR cluster with one master group and one autoscaled core group.
# NOTE(review): main, sg, emr_profile and iam_emr_service_role are not defined
# in this snippet; they are expected to come from elsewhere in the program.
cluster = aws.emr.Cluster(
    "cluster",
    name="emr-test-arn",
    release_label="emr-4.6.0",
    applications=["Spark"],
    additional_info=additional_info_doc,
    termination_protection=False,
    keep_job_flow_alive_when_no_steps=True,
    ec2_attributes={
        "subnet_id": main["id"],
        "emr_managed_master_security_group": sg["id"],
        "emr_managed_slave_security_group": sg["id"],
        "instance_profile": emr_profile["arn"],
    },
    master_instance_group={
        "instance_type": "m4.large",
    },
    core_instance_group={
        "instance_type": "c4.large",
        "instance_count": 1,
        "ebs_configs": [{
            "size": 40,
            "type": "gp2",
            "volumes_per_instance": 1,
        }],
        "bid_price": "0.30",
        "autoscaling_policy": core_autoscaling_policy_doc,
    },
    ebs_root_volume_size=100,
    tags={
        "role": "rolename",
        "env": "env",
    },
    bootstrap_actions=[{
        "path": "s3://elasticmapreduce/bootstrap-actions/run-if",
        "name": "runif",
        "args": [
            "instance.isMaster=true",
            "echo running on master node",
        ],
    }],
    configurations_json=configurations_doc,
    service_role=iam_emr_service_role["arn"],
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// JSON documents passed to the cluster. They are raw (backtick) string
// literals, in which a backslash is kept verbatim, so the quotes must not be
// backslash-escaped or the resulting text is not valid JSON.
const (
	additionalInfoJSON = `{
  "instanceAwsClientConfiguration": {
    "proxyPort": 8099,
    "proxyHost": "myproxy.example.com"
  }
}
`

	// Core-group autoscaling: capacity between 1 and 2, adding one instance
	// when average YARNMemoryAvailablePercentage is below 15 percent.
	coreAutoscalingPolicyJSON = `{
  "Constraints": {
    "MinCapacity": 1,
    "MaxCapacity": 2
  },
  "Rules": [
    {
      "Name": "ScaleOutMemoryPercentage",
      "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
      "Action": {
        "SimpleScalingPolicyConfiguration": {
          "AdjustmentType": "CHANGE_IN_CAPACITY",
          "ScalingAdjustment": 1,
          "CoolDown": 300
        }
      },
      "Trigger": {
        "CloudWatchAlarmDefinition": {
          "ComparisonOperator": "LESS_THAN",
          "EvaluationPeriods": 1,
          "MetricName": "YARNMemoryAvailablePercentage",
          "Namespace": "AWS/ElasticMapReduce",
          "Period": 300,
          "Statistic": "AVERAGE",
          "Threshold": 15.0,
          "Unit": "PERCENT"
        }
      }
    }
  ]
}
`

	// Sets JAVA_HOME for the hadoop-env and spark-env classifications.
	configurationsJSON = `[
  {
    "Classification": "hadoop-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  },
  {
    "Classification": "spark-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  }
]
`
)

// main registers an EMR cluster with one master group and one autoscaled core
// group.
// NOTE(review): main.Id, sg.Id, emrProfile.Arn and iamEmrServiceRole.Arn refer
// to values that are not declared in this snippet; supply them from your own
// program before compiling.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := emr.NewCluster(ctx, "cluster", &emr.ClusterArgs{
			Name:         pulumi.String("emr-test-arn"),
			ReleaseLabel: pulumi.String("emr-4.6.0"),
			Applications: pulumi.StringArray{
				pulumi.String("Spark"),
			},
			AdditionalInfo:              pulumi.String(additionalInfoJSON),
			TerminationProtection:       pulumi.Bool(false),
			KeepJobFlowAliveWhenNoSteps: pulumi.Bool(true),
			Ec2Attributes: &emr.ClusterEc2AttributesArgs{
				SubnetId:                      pulumi.Any(main.Id),
				EmrManagedMasterSecurityGroup: pulumi.Any(sg.Id),
				EmrManagedSlaveSecurityGroup:  pulumi.Any(sg.Id),
				InstanceProfile:               pulumi.Any(emrProfile.Arn),
			},
			MasterInstanceGroup: &emr.ClusterMasterInstanceGroupArgs{
				InstanceType: pulumi.String("m4.large"),
			},
			CoreInstanceGroup: &emr.ClusterCoreInstanceGroupArgs{
				InstanceType:  pulumi.String("c4.large"),
				InstanceCount: pulumi.Int(1),
				EbsConfigs: emr.ClusterCoreInstanceGroupEbsConfigArray{
					&emr.ClusterCoreInstanceGroupEbsConfigArgs{
						Size:               pulumi.Int(40),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				BidPrice:          pulumi.String("0.30"),
				AutoscalingPolicy: pulumi.String(coreAutoscalingPolicyJSON),
			},
			EbsRootVolumeSize: pulumi.Int(100),
			Tags: pulumi.StringMap{
				"role": pulumi.String("rolename"),
				"env":  pulumi.String("env"),
			},
			BootstrapActions: emr.ClusterBootstrapActionArray{
				&emr.ClusterBootstrapActionArgs{
					Path: pulumi.String("s3://elasticmapreduce/bootstrap-actions/run-if"),
					Name: pulumi.String("runif"),
					Args: pulumi.StringArray{
						pulumi.String("instance.isMaster=true"),
						pulumi.String("echo running on master node"),
					},
				},
			},
			ConfigurationsJson: pulumi.String(configurationsJSON),
			ServiceRole:        pulumi.Any(iamEmrServiceRole.Arn),
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // JSON documents passed to the cluster. In a verbatim string (@"...") a
    // double quote is written as "" and a backslash is kept literally, so the
    // quotes must not be preceded by a backslash or the JSON becomes invalid.
    const string additionalInfoJson = @"{
  ""instanceAwsClientConfiguration"": {
    ""proxyPort"": 8099,
    ""proxyHost"": ""myproxy.example.com""
  }
}
";

    // Core-group autoscaling: capacity between 1 and 2, adding one instance
    // when average YARNMemoryAvailablePercentage is below 15 percent.
    const string coreAutoscalingPolicyJson = @"{
  ""Constraints"": {
    ""MinCapacity"": 1,
    ""MaxCapacity"": 2
  },
  ""Rules"": [
    {
      ""Name"": ""ScaleOutMemoryPercentage"",
      ""Description"": ""Scale out if YARNMemoryAvailablePercentage is less than 15"",
      ""Action"": {
        ""SimpleScalingPolicyConfiguration"": {
          ""AdjustmentType"": ""CHANGE_IN_CAPACITY"",
          ""ScalingAdjustment"": 1,
          ""CoolDown"": 300
        }
      },
      ""Trigger"": {
        ""CloudWatchAlarmDefinition"": {
          ""ComparisonOperator"": ""LESS_THAN"",
          ""EvaluationPeriods"": 1,
          ""MetricName"": ""YARNMemoryAvailablePercentage"",
          ""Namespace"": ""AWS/ElasticMapReduce"",
          ""Period"": 300,
          ""Statistic"": ""AVERAGE"",
          ""Threshold"": 15.0,
          ""Unit"": ""PERCENT""
        }
      }
    }
  ]
}
";

    // Sets JAVA_HOME for the hadoop-env and spark-env classifications.
    const string configurationsJson = @"[
  {
    ""Classification"": ""hadoop-env"",
    ""Configurations"": [
      {
        ""Classification"": ""export"",
        ""Properties"": {
          ""JAVA_HOME"": ""/usr/lib/jvm/java-1.8.0""
        }
      }
    ],
    ""Properties"": {}
  },
  {
    ""Classification"": ""spark-env"",
    ""Configurations"": [
      {
        ""Classification"": ""export"",
        ""Properties"": {
          ""JAVA_HOME"": ""/usr/lib/jvm/java-1.8.0""
        }
      }
    ],
    ""Properties"": {}
  }
]
";

    // EMR cluster with one master group and one autoscaled core group.
    // NOTE(review): main, sg, emrProfile and iamEmrServiceRole are not declared
    // in this snippet; they are expected to be defined elsewhere.
    var cluster = new Aws.Emr.Cluster("cluster", new()
    {
        Name = "emr-test-arn",
        ReleaseLabel = "emr-4.6.0",
        Applications = new[]
        {
            "Spark",
        },
        AdditionalInfo = additionalInfoJson,
        TerminationProtection = false,
        KeepJobFlowAliveWhenNoSteps = true,
        Ec2Attributes = new Aws.Emr.Inputs.ClusterEc2AttributesArgs
        {
            SubnetId = main.Id,
            EmrManagedMasterSecurityGroup = sg.Id,
            EmrManagedSlaveSecurityGroup = sg.Id,
            InstanceProfile = emrProfile.Arn,
        },
        MasterInstanceGroup = new Aws.Emr.Inputs.ClusterMasterInstanceGroupArgs
        {
            InstanceType = "m4.large",
        },
        CoreInstanceGroup = new Aws.Emr.Inputs.ClusterCoreInstanceGroupArgs
        {
            InstanceType = "c4.large",
            InstanceCount = 1,
            EbsConfigs = new[]
            {
                new Aws.Emr.Inputs.ClusterCoreInstanceGroupEbsConfigArgs
                {
                    Size = 40,
                    Type = "gp2",
                    VolumesPerInstance = 1,
                },
            },
            BidPrice = "0.30",
            AutoscalingPolicy = coreAutoscalingPolicyJson,
        },
        EbsRootVolumeSize = 100,
        Tags =
        {
            { "role", "rolename" },
            { "env", "env" },
        },
        BootstrapActions = new[]
        {
            new Aws.Emr.Inputs.ClusterBootstrapActionArgs
            {
                Path = "s3://elasticmapreduce/bootstrap-actions/run-if",
                Name = "runif",
                Args = new[]
                {
                    "instance.isMaster=true",
                    "echo running on master node",
                },
            },
        },
        ConfigurationsJson = configurationsJson,
        ServiceRole = iamEmrServiceRole.Arn,
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterEc2AttributesArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceGroupEbsConfigArgs;
import com.pulumi.aws.emr.inputs.ClusterBootstrapActionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that declares an EMR cluster with one master instance group
 * and one autoscaled core instance group.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the cluster.
     *
     * NOTE(review): main, sg, emrProfile and iamEmrServiceRole are not declared
     * in this snippet; they are expected to be resources defined elsewhere.
     */
    public static void stack(Context ctx) {
        // Double quotes need no escaping inside a text block, so the JSON
        // documents below are written verbatim.
        var cluster = new Cluster("cluster", ClusterArgs.builder()
            .name("emr-test-arn")
            .releaseLabel("emr-4.6.0")
            .applications("Spark")
            .additionalInfo("""
                {
                  "instanceAwsClientConfiguration": {
                    "proxyPort": 8099,
                    "proxyHost": "myproxy.example.com"
                  }
                }
                """)
            .terminationProtection(false)
            .keepJobFlowAliveWhenNoSteps(true)
            .ec2Attributes(ClusterEc2AttributesArgs.builder()
                .subnetId(main.id())
                .emrManagedMasterSecurityGroup(sg.id())
                .emrManagedSlaveSecurityGroup(sg.id())
                .instanceProfile(emrProfile.arn())
                .build())
            .masterInstanceGroup(ClusterMasterInstanceGroupArgs.builder()
                .instanceType("m4.large")
                .build())
            .coreInstanceGroup(ClusterCoreInstanceGroupArgs.builder()
                .instanceType("c4.large")
                .instanceCount(1)
                .ebsConfigs(ClusterCoreInstanceGroupEbsConfigArgs.builder()
                    .size(40)
                    .type("gp2")
                    .volumesPerInstance(1)
                    .build())
                .bidPrice("0.30")
                // Capacity between 1 and 2; add one instance when average
                // YARNMemoryAvailablePercentage is below 15 percent.
                .autoscalingPolicy("""
                    {
                      "Constraints": {
                        "MinCapacity": 1,
                        "MaxCapacity": 2
                      },
                      "Rules": [
                        {
                          "Name": "ScaleOutMemoryPercentage",
                          "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
                          "Action": {
                            "SimpleScalingPolicyConfiguration": {
                              "AdjustmentType": "CHANGE_IN_CAPACITY",
                              "ScalingAdjustment": 1,
                              "CoolDown": 300
                            }
                          },
                          "Trigger": {
                            "CloudWatchAlarmDefinition": {
                              "ComparisonOperator": "LESS_THAN",
                              "EvaluationPeriods": 1,
                              "MetricName": "YARNMemoryAvailablePercentage",
                              "Namespace": "AWS/ElasticMapReduce",
                              "Period": 300,
                              "Statistic": "AVERAGE",
                              "Threshold": 15.0,
                              "Unit": "PERCENT"
                            }
                          }
                        }
                      ]
                    }
                    """)
                .build())
            .ebsRootVolumeSize(100)
            .tags(Map.ofEntries(
                Map.entry("role", "rolename"),
                Map.entry("env", "env")
            ))
            .bootstrapActions(ClusterBootstrapActionArgs.builder()
                .path("s3://elasticmapreduce/bootstrap-actions/run-if")
                .name("runif")
                .args(
                    "instance.isMaster=true",
                    "echo running on master node")
                .build())
            // Sets JAVA_HOME for the hadoop-env and spark-env classifications.
            .configurationsJson("""
                [
                  {
                    "Classification": "hadoop-env",
                    "Configurations": [
                      {
                        "Classification": "export",
                        "Properties": {
                          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                        }
                      }
                    ],
                    "Properties": {}
                  },
                  {
                    "Classification": "spark-env",
                    "Configurations": [
                      {
                        "Classification": "export",
                        "Properties": {
                          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                        }
                      }
                    ],
                    "Properties": {}
                  }
                ]
                """)
            .serviceRole(iamEmrServiceRole.arn())
            .build());
    }
}
# EMR cluster with one master group and one autoscaled core group.
# main, sg, emrProfile and iamEmrServiceRole are referenced but not declared in
# this snippet; declare them alongside this resource.
resources:
  cluster:
    type: aws:emr:Cluster
    properties:
      name: emr-test-arn
      releaseLabel: emr-4.6.0
      applications:
        - Spark
      # Block scalars keep backslashes literally, so the JSON quotes are
      # written without escaping.
      additionalInfo: |
        {
          "instanceAwsClientConfiguration": {
            "proxyPort": 8099,
            "proxyHost": "myproxy.example.com"
          }
        }
      terminationProtection: false
      keepJobFlowAliveWhenNoSteps: true
      ec2Attributes:
        subnetId: ${main.id}
        emrManagedMasterSecurityGroup: ${sg.id}
        emrManagedSlaveSecurityGroup: ${sg.id}
        instanceProfile: ${emrProfile.arn}
      masterInstanceGroup:
        instanceType: m4.large
      coreInstanceGroup:
        instanceType: c4.large
        instanceCount: 1
        ebsConfigs:
          # Numeric, matching the other language examples (was the string '40').
          - size: 40
            type: gp2
            volumesPerInstance: 1
        bidPrice: '0.30'
        # Capacity between 1 and 2; add one instance when average
        # YARNMemoryAvailablePercentage is below 15 percent.
        autoscalingPolicy: |
          {
            "Constraints": {
              "MinCapacity": 1,
              "MaxCapacity": 2
            },
            "Rules": [
              {
                "Name": "ScaleOutMemoryPercentage",
                "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
                "Action": {
                  "SimpleScalingPolicyConfiguration": {
                    "AdjustmentType": "CHANGE_IN_CAPACITY",
                    "ScalingAdjustment": 1,
                    "CoolDown": 300
                  }
                },
                "Trigger": {
                  "CloudWatchAlarmDefinition": {
                    "ComparisonOperator": "LESS_THAN",
                    "EvaluationPeriods": 1,
                    "MetricName": "YARNMemoryAvailablePercentage",
                    "Namespace": "AWS/ElasticMapReduce",
                    "Period": 300,
                    "Statistic": "AVERAGE",
                    "Threshold": 15.0,
                    "Unit": "PERCENT"
                  }
                }
              }
            ]
          }
      ebsRootVolumeSize: 100
      tags:
        role: rolename
        env: env
      bootstrapActions:
        - path: s3://elasticmapreduce/bootstrap-actions/run-if
          name: runif
          args:
            - instance.isMaster=true
            - echo running on master node
      # Sets JAVA_HOME for the hadoop-env and spark-env classifications.
      configurationsJson: |
        [
          {
            "Classification": "hadoop-env",
            "Configurations": [
              {
                "Classification": "export",
                "Properties": {
                  "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                }
              }
            ],
            "Properties": {}
          },
          {
            "Classification": "spark-env",
            "Configurations": [
              {
                "Classification": "export",
                "Properties": {
                  "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                }
              }
            ],
            "Properties": {}
          }
        ]
      serviceRole: ${iamEmrServiceRole.arn}
The cluster launches with a master node and core nodes that scale based on YARN memory availability. The releaseLabel specifies the EMR version; applications lists the software to install (Spark in this case). The masterInstanceGroup and coreInstanceGroup define instance types and counts. The autoscalingPolicy on core nodes uses CloudWatch metrics to trigger scaling actions. The serviceRole grants EMR permissions to manage AWS resources; ec2Attributes places instances in your VPC with the specified security groups and instance profile.
Use instance fleets for flexible capacity
Instance fleets let EMR select from multiple instance types and purchase options to meet capacity targets, optimizing for cost and availability.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Builds one Spot-capable instance type entry with a single 100 GiB gp2 volume.
const fleetInstanceType = (instanceType, weightedCapacity, bidPercent) => ({
    bidPriceAsPercentageOfOnDemandPrice: bidPercent,
    ebsConfigs: [{
        size: 100,
        type: "gp2",
        volumesPerInstance: 1,
    }],
    instanceType,
    weightedCapacity,
});

// Builds the launch specification shared by both fleets; only the action
// taken after the 10-minute timeout differs.
const spotLaunchSpec = (timeoutAction) => ({
    spotSpecifications: [{
        allocationStrategy: "capacity-optimized",
        blockDurationMinutes: 0,
        timeoutAction,
        timeoutDurationMinutes: 10,
    }],
});

// Cluster whose master and core nodes are provisioned through instance fleets.
const example = new aws.emr.Cluster("example", {
    masterInstanceFleet: {
        instanceTypeConfigs: [{
            instanceType: "m4.xlarge",
        }],
        targetOnDemandCapacity: 1,
    },
    coreInstanceFleet: {
        instanceTypeConfigs: [
            fleetInstanceType("m3.xlarge", 1, 80),
            fleetInstanceType("m4.xlarge", 1, 100),
            fleetInstanceType("m4.2xlarge", 2, 100),
        ],
        launchSpecifications: spotLaunchSpec("SWITCH_TO_ON_DEMAND"),
        name: "core fleet",
        targetOnDemandCapacity: 2,
        targetSpotCapacity: 2,
    },
});

// Task fleet attached to the cluster above as a separate resource.
const task = new aws.emr.InstanceFleet("task", {
    clusterId: example.id,
    instanceTypeConfigs: [
        fleetInstanceType("m4.xlarge", 1, 100),
        fleetInstanceType("m4.2xlarge", 2, 100),
    ],
    launchSpecifications: spotLaunchSpec("TERMINATE_CLUSTER"),
    name: "task fleet",
    targetOnDemandCapacity: 1,
    targetSpotCapacity: 1,
});
import pulumi
import pulumi_aws as aws
def _fleet_instance_type(instance_type, weighted_capacity, bid_percent):
    """Return one instance type entry with a single 100 GiB gp2 volume."""
    return {
        "bid_price_as_percentage_of_on_demand_price": bid_percent,
        "ebs_configs": [{
            "size": 100,
            "type": "gp2",
            "volumes_per_instance": 1,
        }],
        "instance_type": instance_type,
        "weighted_capacity": weighted_capacity,
    }


def _spot_launch_spec(timeout_action):
    """Return the launch specification; only the timeout action varies."""
    return {
        "spot_specifications": [{
            "allocation_strategy": "capacity-optimized",
            "block_duration_minutes": 0,
            "timeout_action": timeout_action,
            "timeout_duration_minutes": 10,
        }],
    }


# Cluster whose master and core nodes are provisioned through instance fleets.
example = aws.emr.Cluster(
    "example",
    master_instance_fleet={
        "instance_type_configs": [{
            "instance_type": "m4.xlarge",
        }],
        "target_on_demand_capacity": 1,
    },
    core_instance_fleet={
        "instance_type_configs": [
            _fleet_instance_type("m3.xlarge", 1, 80),
            _fleet_instance_type("m4.xlarge", 1, 100),
            _fleet_instance_type("m4.2xlarge", 2, 100),
        ],
        "launch_specifications": _spot_launch_spec("SWITCH_TO_ON_DEMAND"),
        "name": "core fleet",
        "target_on_demand_capacity": 2,
        "target_spot_capacity": 2,
    },
)

# Task fleet attached to the cluster above as a separate resource.
task = aws.emr.InstanceFleet(
    "task",
    cluster_id=example.id,
    instance_type_configs=[
        _fleet_instance_type("m4.xlarge", 1, 100),
        _fleet_instance_type("m4.2xlarge", 2, 100),
    ],
    launch_specifications=_spot_launch_spec("TERMINATE_CLUSTER"),
    name="task fleet",
    target_on_demand_capacity=1,
    target_spot_capacity=1,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main registers an EMR cluster built from instance fleets, plus a separate
// task fleet attached to that cluster.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// coreType builds one core-fleet instance type entry with a single
		// 100 GiB gp2 volume.
		coreType := func(instanceType string, weight int, bidPercent float64) *emr.ClusterCoreInstanceFleetInstanceTypeConfigArgs {
			return &emr.ClusterCoreInstanceFleetInstanceTypeConfigArgs{
				BidPriceAsPercentageOfOnDemandPrice: pulumi.Float64(bidPercent),
				EbsConfigs: emr.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArray{
					&emr.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs{
						Size:               pulumi.Int(100),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				InstanceType:     pulumi.String(instanceType),
				WeightedCapacity: pulumi.Int(weight),
			}
		}

		// taskType is the InstanceFleet counterpart of coreType; the SDK uses
		// distinct types for the two resources.
		taskType := func(instanceType string, weight int, bidPercent float64) *emr.InstanceFleetInstanceTypeConfigArgs {
			return &emr.InstanceFleetInstanceTypeConfigArgs{
				BidPriceAsPercentageOfOnDemandPrice: pulumi.Float64(bidPercent),
				EbsConfigs: emr.InstanceFleetInstanceTypeConfigEbsConfigArray{
					&emr.InstanceFleetInstanceTypeConfigEbsConfigArgs{
						Size:               pulumi.Int(100),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				InstanceType:     pulumi.String(instanceType),
				WeightedCapacity: pulumi.Int(weight),
			}
		}

		example, err := emr.NewCluster(ctx, "example", &emr.ClusterArgs{
			MasterInstanceFleet: &emr.ClusterMasterInstanceFleetArgs{
				InstanceTypeConfigs: emr.ClusterMasterInstanceFleetInstanceTypeConfigArray{
					&emr.ClusterMasterInstanceFleetInstanceTypeConfigArgs{
						InstanceType: pulumi.String("m4.xlarge"),
					},
				},
				TargetOnDemandCapacity: pulumi.Int(1),
			},
			CoreInstanceFleet: &emr.ClusterCoreInstanceFleetArgs{
				InstanceTypeConfigs: emr.ClusterCoreInstanceFleetInstanceTypeConfigArray{
					coreType("m3.xlarge", 1, 80),
					coreType("m4.xlarge", 1, 100),
					coreType("m4.2xlarge", 2, 100),
				},
				LaunchSpecifications: &emr.ClusterCoreInstanceFleetLaunchSpecificationsArgs{
					SpotSpecifications: emr.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArray{
						&emr.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs{
							AllocationStrategy:     pulumi.String("capacity-optimized"),
							BlockDurationMinutes:   pulumi.Int(0),
							TimeoutAction:          pulumi.String("SWITCH_TO_ON_DEMAND"),
							TimeoutDurationMinutes: pulumi.Int(10),
						},
					},
				},
				Name:                   pulumi.String("core fleet"),
				TargetOnDemandCapacity: pulumi.Int(2),
				TargetSpotCapacity:     pulumi.Int(2),
			},
		})
		if err != nil {
			return err
		}

		// The task fleet is a separate resource bound to the cluster by ID.
		_, err = emr.NewInstanceFleet(ctx, "task", &emr.InstanceFleetArgs{
			ClusterId: example.ID(),
			InstanceTypeConfigs: emr.InstanceFleetInstanceTypeConfigArray{
				taskType("m4.xlarge", 1, 100),
				taskType("m4.2xlarge", 2, 100),
			},
			LaunchSpecifications: &emr.InstanceFleetLaunchSpecificationsArgs{
				SpotSpecifications: emr.InstanceFleetLaunchSpecificationsSpotSpecificationArray{
					&emr.InstanceFleetLaunchSpecificationsSpotSpecificationArgs{
						AllocationStrategy:     pulumi.String("capacity-optimized"),
						BlockDurationMinutes:   pulumi.Int(0),
						TimeoutAction:          pulumi.String("TERMINATE_CLUSTER"),
						TimeoutDurationMinutes: pulumi.Int(10),
					},
				},
			},
			Name:                   pulumi.String("task fleet"),
			TargetOnDemandCapacity: pulumi.Int(1),
			TargetSpotCapacity:     pulumi.Int(1),
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Builds one core-fleet instance type entry with a single 100 GiB gp2 volume.
    Aws.Emr.Inputs.ClusterCoreInstanceFleetInstanceTypeConfigArgs CoreType(
        string instanceType, int weight, double bidPercent) => new()
    {
        BidPriceAsPercentageOfOnDemandPrice = bidPercent,
        EbsConfigs = new[]
        {
            new Aws.Emr.Inputs.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs
            {
                Size = 100,
                Type = "gp2",
                VolumesPerInstance = 1,
            },
        },
        InstanceType = instanceType,
        WeightedCapacity = weight,
    };

    // InstanceFleet counterpart of CoreType; the SDK uses distinct types for
    // the two resources.
    Aws.Emr.Inputs.InstanceFleetInstanceTypeConfigArgs TaskType(
        string instanceType, int weight, double bidPercent) => new()
    {
        BidPriceAsPercentageOfOnDemandPrice = bidPercent,
        EbsConfigs = new[]
        {
            new Aws.Emr.Inputs.InstanceFleetInstanceTypeConfigEbsConfigArgs
            {
                Size = 100,
                Type = "gp2",
                VolumesPerInstance = 1,
            },
        },
        InstanceType = instanceType,
        WeightedCapacity = weight,
    };

    // Cluster whose master and core nodes are provisioned through instance fleets.
    var example = new Aws.Emr.Cluster("example", new()
    {
        MasterInstanceFleet = new Aws.Emr.Inputs.ClusterMasterInstanceFleetArgs
        {
            InstanceTypeConfigs = new[]
            {
                new Aws.Emr.Inputs.ClusterMasterInstanceFleetInstanceTypeConfigArgs
                {
                    InstanceType = "m4.xlarge",
                },
            },
            TargetOnDemandCapacity = 1,
        },
        CoreInstanceFleet = new Aws.Emr.Inputs.ClusterCoreInstanceFleetArgs
        {
            InstanceTypeConfigs = new[]
            {
                CoreType("m3.xlarge", 1, 80),
                CoreType("m4.xlarge", 1, 100),
                CoreType("m4.2xlarge", 2, 100),
            },
            LaunchSpecifications = new Aws.Emr.Inputs.ClusterCoreInstanceFleetLaunchSpecificationsArgs
            {
                SpotSpecifications = new[]
                {
                    new Aws.Emr.Inputs.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs
                    {
                        AllocationStrategy = "capacity-optimized",
                        BlockDurationMinutes = 0,
                        TimeoutAction = "SWITCH_TO_ON_DEMAND",
                        TimeoutDurationMinutes = 10,
                    },
                },
            },
            Name = "core fleet",
            TargetOnDemandCapacity = 2,
            TargetSpotCapacity = 2,
        },
    });

    // Task fleet attached to the cluster above as a separate resource.
    var task = new Aws.Emr.InstanceFleet("task", new()
    {
        ClusterId = example.Id,
        InstanceTypeConfigs = new[]
        {
            TaskType("m4.xlarge", 1, 100),
            TaskType("m4.2xlarge", 2, 100),
        },
        LaunchSpecifications = new Aws.Emr.Inputs.InstanceFleetLaunchSpecificationsArgs
        {
            SpotSpecifications = new[]
            {
                new Aws.Emr.Inputs.InstanceFleetLaunchSpecificationsSpotSpecificationArgs
                {
                    AllocationStrategy = "capacity-optimized",
                    BlockDurationMinutes = 0,
                    TimeoutAction = "TERMINATE_CLUSTER",
                    TimeoutDurationMinutes = 10,
                },
            },
        },
        Name = "task fleet",
        TargetOnDemandCapacity = 1,
        TargetSpotCapacity = 1,
    });
});
package generated_program;

import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceFleetArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceFleetInstanceTypeConfigArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetInstanceTypeConfigArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetLaunchSpecificationsArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs;
import com.pulumi.aws.emr.InstanceFleet;
import com.pulumi.aws.emr.InstanceFleetArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetInstanceTypeConfigArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetInstanceTypeConfigEbsConfigArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetLaunchSpecificationsArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetLaunchSpecificationsSpotSpecificationArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;

/**
 * Pulumi program that declares an EMR cluster built from instance fleets and a
 * separate task fleet attached to it.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /** Declares the cluster and its task fleet. */
    public static void stack(Context ctx) {
        var example = new Cluster("example", ClusterArgs.builder()
            .masterInstanceFleet(ClusterMasterInstanceFleetArgs.builder()
                .instanceTypeConfigs(ClusterMasterInstanceFleetInstanceTypeConfigArgs.builder()
                    .instanceType("m4.xlarge")
                    .build())
                .targetOnDemandCapacity(1)
                .build())
            .coreInstanceFleet(ClusterCoreInstanceFleetArgs.builder()
                .instanceTypeConfigs(
                    ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
                        .bidPriceAsPercentageOfOnDemandPrice(80.0)
                        .ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
                            .size(100)
                            .type("gp2")
                            .volumesPerInstance(1)
                            .build())
                        .instanceType("m3.xlarge")
                        .weightedCapacity(1)
                        .build(),
                    ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
                        .bidPriceAsPercentageOfOnDemandPrice(100.0)
                        .ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
                            .size(100)
                            .type("gp2")
                            .volumesPerInstance(1)
                            .build())
                        .instanceType("m4.xlarge")
                        .weightedCapacity(1)
                        .build(),
                    ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
                        .bidPriceAsPercentageOfOnDemandPrice(100.0)
                        .ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
                            .size(100)
                            .type("gp2")
                            .volumesPerInstance(1)
                            .build())
                        .instanceType("m4.2xlarge")
                        .weightedCapacity(2)
                        .build())
                .launchSpecifications(ClusterCoreInstanceFleetLaunchSpecificationsArgs.builder()
                    .spotSpecifications(ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs.builder()
                        .allocationStrategy("capacity-optimized")
                        .blockDurationMinutes(0)
                        .timeoutAction("SWITCH_TO_ON_DEMAND")
                        .timeoutDurationMinutes(10)
                        .build())
                    .build())
                .name("core fleet")
                .targetOnDemandCapacity(2)
                .targetSpotCapacity(2)
                .build())
            .build());

        // The task fleet is a separate resource bound to the cluster by ID.
        var task = new InstanceFleet("task", InstanceFleetArgs.builder()
            .clusterId(example.id())
            .instanceTypeConfigs(
                InstanceFleetInstanceTypeConfigArgs.builder()
                    .bidPriceAsPercentageOfOnDemandPrice(100.0)
                    .ebsConfigs(InstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
                        .size(100)
                        .type("gp2")
                        .volumesPerInstance(1)
                        .build())
                    .instanceType("m4.xlarge")
                    .weightedCapacity(1)
                    .build(),
                InstanceFleetInstanceTypeConfigArgs.builder()
                    .bidPriceAsPercentageOfOnDemandPrice(100.0)
                    .ebsConfigs(InstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
                        .size(100)
                        .type("gp2")
                        .volumesPerInstance(1)
                        .build())
                    .instanceType("m4.2xlarge")
                    .weightedCapacity(2)
                    .build())
            .launchSpecifications(InstanceFleetLaunchSpecificationsArgs.builder()
                .spotSpecifications(InstanceFleetLaunchSpecificationsSpotSpecificationArgs.builder()
                    .allocationStrategy("capacity-optimized")
                    .blockDurationMinutes(0)
                    .timeoutAction("TERMINATE_CLUSTER")
                    .timeoutDurationMinutes(10)
                    .build())
                .build())
            .name("task fleet")
            .targetOnDemandCapacity(1)
            .targetSpotCapacity(1)
            .build());
    }
}
resources:
  # Cluster whose master and core nodes are provisioned through instance fleets.
  example:
    type: aws:emr:Cluster
    properties:
      masterInstanceFleet:
        instanceTypeConfigs:
          - instanceType: m4.xlarge
        targetOnDemandCapacity: 1
      coreInstanceFleet:
        instanceTypeConfigs:
          - bidPriceAsPercentageOfOnDemandPrice: 80
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
            instanceType: m3.xlarge
            weightedCapacity: 1
          - bidPriceAsPercentageOfOnDemandPrice: 100
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
            instanceType: m4.xlarge
            weightedCapacity: 1
          - bidPriceAsPercentageOfOnDemandPrice: 100
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
            instanceType: m4.2xlarge
            weightedCapacity: 2
        launchSpecifications:
          spotSpecifications:
            - allocationStrategy: capacity-optimized
              blockDurationMinutes: 0
              timeoutAction: SWITCH_TO_ON_DEMAND
              timeoutDurationMinutes: 10
        name: core fleet
        targetOnDemandCapacity: 2
        targetSpotCapacity: 2
  # Task fleet attached to the cluster above as a separate resource.
  task:
    type: aws:emr:InstanceFleet
    properties:
      clusterId: ${example.id}
      instanceTypeConfigs:
        - bidPriceAsPercentageOfOnDemandPrice: 100
          ebsConfigs:
            - size: 100
              type: gp2
              volumesPerInstance: 1
          instanceType: m4.xlarge
          weightedCapacity: 1
        - bidPriceAsPercentageOfOnDemandPrice: 100
          ebsConfigs:
            - size: 100
              type: gp2
              volumesPerInstance: 1
          instanceType: m4.2xlarge
          weightedCapacity: 2
      launchSpecifications:
        spotSpecifications:
          - allocationStrategy: capacity-optimized
            blockDurationMinutes: 0
            timeoutAction: TERMINATE_CLUSTER
            timeoutDurationMinutes: 10
      name: task fleet
      targetOnDemandCapacity: 1
      targetSpotCapacity: 1
Instance fleets replace instance groups with a more flexible model. The masterInstanceFleet and coreInstanceFleet define capacity targets rather than fixed instance counts, and the task fleet is added as a separate aws.emr.InstanceFleet resource that references the cluster through clusterId. Each instanceTypeConfig specifies an instance type, EBS configuration, and bid price as a percentage of on-demand. The launchSpecifications control spot instance behavior: allocationStrategy determines how EMR selects instances, timeoutAction defines what happens if spot capacity isn’t available (SWITCH_TO_ON_DEMAND for the core fleet, TERMINATE_CLUSTER for the task fleet in this example), and timeoutDurationMinutes sets how long EMR waits for spot capacity before taking that action.
Enable debug logging with a setup step
EMR implements debug logging as a cluster step that pushes state to S3 for troubleshooting.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Hadoop debugging is enabled by a single cluster step that runs the
// state-pusher script through command-runner.jar.
const hadoopDebuggingStep = {
    actionOnFailure: "TERMINATE_CLUSTER",
    name: "Setup Hadoop Debugging",
    hadoopJarStep: {
        jar: "command-runner.jar",
        args: ["state-pusher-script"],
    },
};
const example = new aws.emr.Cluster("example", {
    steps: [hadoopDebuggingStep],
});
import pulumi
import pulumi_aws as aws
# Hadoop debugging is enabled by a single cluster step that runs the
# state-pusher script through command-runner.jar.
hadoop_debugging_step = {
    "action_on_failure": "TERMINATE_CLUSTER",
    "name": "Setup Hadoop Debugging",
    "hadoop_jar_step": {
        "jar": "command-runner.jar",
        "args": ["state-pusher-script"],
    },
}
example = aws.emr.Cluster("example", steps=[hadoop_debugging_step])
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main registers one EMR cluster whose only step enables Hadoop debugging.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// The debugging step runs the state-pusher script through command-runner.jar.
		debugStep := &emr.ClusterStepArgs{
			ActionOnFailure: pulumi.String("TERMINATE_CLUSTER"),
			Name:            pulumi.String("Setup Hadoop Debugging"),
			HadoopJarStep: &emr.ClusterStepHadoopJarStepArgs{
				Jar:  pulumi.String("command-runner.jar"),
				Args: pulumi.StringArray{pulumi.String("state-pusher-script")},
			},
		}
		clusterArgs := &emr.ClusterArgs{
			Steps: emr.ClusterStepArray{debugStep},
		}
		// Surface any registration error to the Pulumi runtime.
		_, err := emr.NewCluster(ctx, "example", clusterArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // Hadoop debugging is enabled by a single cluster step that runs the
    // state-pusher script through command-runner.jar.
    var hadoopDebuggingStep = new Aws.Emr.Inputs.ClusterStepArgs
    {
        ActionOnFailure = "TERMINATE_CLUSTER",
        Name = "Setup Hadoop Debugging",
        HadoopJarStep = new Aws.Emr.Inputs.ClusterStepHadoopJarStepArgs
        {
            Jar = "command-runner.jar",
            Args = new[] { "state-pusher-script" },
        },
    };

    var example = new Aws.Emr.Cluster("example", new()
    {
        Steps = new[] { hadoopDebuggingStep },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterStepArgs;
import com.pulumi.aws.emr.inputs.ClusterStepHadoopJarStepArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares an EMR cluster with a Hadoop debugging step. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the cluster. Its only step runs the state-pusher script
     * through command-runner.jar.
     *
     * @param ctx the Pulumi deployment context
     */
    public static void stack(Context ctx) {
        final var stateJarStep = ClusterStepHadoopJarStepArgs.builder()
            .jar("command-runner.jar")
            .args("state-pusher-script")
            .build();

        final var debuggingStep = ClusterStepArgs.builder()
            .actionOnFailure("TERMINATE_CLUSTER")
            .name("Setup Hadoop Debugging")
            .hadoopJarStep(stateJarStep)
            .build();

        var example = new Cluster("example", ClusterArgs.builder()
            .steps(debuggingStep)
            .build());
    }
}
# Declares one EMR cluster whose only step enables Hadoop debugging by running
# the state-pusher script through command-runner.jar.
resources:
  example:
    type: aws:emr:Cluster
    properties:
      steps:
        # actionOnFailure controls what the cluster does if this step fails.
        - actionOnFailure: TERMINATE_CLUSTER
          name: Setup Hadoop Debugging
          hadoopJarStep:
            jar: command-runner.jar
            args:
              - state-pusher-script
Debug logging runs as the first step in your cluster. The hadoopJarStep uses command-runner.jar with the state-pusher-script argument to enable logging. The actionOnFailure property controls cluster behavior if the step fails; TERMINATE_CLUSTER stops the cluster immediately. Debug logs require an S3 bucket configured via the logUri property (not shown in this minimal example).
Deploy three master nodes for high availability
For production workloads, EMR supports clusters with three master nodes to eliminate single points of failure. This requires EMR 5.23.0 or later.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Illustrative only: just the settings relevant to running multiple
// master nodes are shown here.
// Public (Internet accessible) subnets must map a public IP on launch.
const example = new aws.ec2.Subnet("example", {
    mapPublicIpOnLaunch: true,
});
const highAvailabilityArgs = {
    releaseLabel: "emr-5.24.1",
    terminationProtection: true,
    ec2Attributes: { subnetId: example.id },
    // Three master nodes.
    masterInstanceGroup: { instanceCount: 3 },
    coreInstanceGroup: {},
};
const exampleCluster = new aws.emr.Cluster("example", highAvailabilityArgs);
import pulumi
import pulumi_aws as aws
# Illustrative only: just the settings relevant to running multiple
# master nodes are shown here.
# Public (Internet accessible) subnets must map a public IP on launch.
example = aws.ec2.Subnet("example", map_public_ip_on_launch=True)
cluster_settings = {
    "release_label": "emr-5.24.1",
    "termination_protection": True,
    "ec2_attributes": {"subnet_id": example.id},
    # Three master nodes.
    "master_instance_group": {"instance_count": 3},
    "core_instance_group": {},
}
example_cluster = aws.emr.Cluster("example", **cluster_settings)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/ec2"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main declares a subnet and an EMR cluster with three master nodes.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Illustrative only: just the settings relevant to running multiple
		// master nodes are shown here.
		// Public (Internet accessible) subnets must map a public IP on launch.
		subnet, err := ec2.NewSubnet(ctx, "example", &ec2.SubnetArgs{
			MapPublicIpOnLaunch: pulumi.Bool(true),
		})
		if err != nil {
			return err
		}

		clusterArgs := &emr.ClusterArgs{
			ReleaseLabel:          pulumi.String("emr-5.24.1"),
			TerminationProtection: pulumi.Bool(true),
			Ec2Attributes: &emr.ClusterEc2AttributesArgs{
				SubnetId: subnet.ID(),
			},
			// Three master nodes.
			MasterInstanceGroup: &emr.ClusterMasterInstanceGroupArgs{
				InstanceCount: pulumi.Int(3),
			},
			CoreInstanceGroup: &emr.ClusterCoreInstanceGroupArgs{},
		}
		// Surface any registration error to the Pulumi runtime.
		_, err = emr.NewCluster(ctx, "example", clusterArgs)
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
return await Deployment.RunAsync(() =>
{
    // This configuration is for illustrative purposes and highlights
    // only relevant configurations for working with this functionality.

    // Map public IP on launch must be enabled for public (Internet accessible) subnets
    var example = new Aws.Ec2.Subnet("example", new()
    {
        MapPublicIpOnLaunch = true,
    });

    var exampleCluster = new Aws.Emr.Cluster("example", new()
    {
        ReleaseLabel = "emr-5.24.1",
        TerminationProtection = true,
        Ec2Attributes = new Aws.Emr.Inputs.ClusterEc2AttributesArgs
        {
            SubnetId = example.Id,
        },
        // Three master nodes.
        MasterInstanceGroup = new Aws.Emr.Inputs.ClusterMasterInstanceGroupArgs
        {
            InstanceCount = 3,
        },
        // The core group's settings are elided in this example. Pass an empty
        // args object, matching the TypeScript, Python, Go, Java and YAML
        // versions, instead of null, which would drop the core instance group
        // from the cluster definition altogether.
        CoreInstanceGroup = new Aws.Emr.Inputs.ClusterCoreInstanceGroupArgs(),
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.ec2.Subnet;
import com.pulumi.aws.ec2.SubnetArgs;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterEc2AttributesArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceGroupArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/** Pulumi program that declares a subnet and an EMR cluster with three master nodes. */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the subnet and the cluster that runs in it.
     *
     * @param ctx the Pulumi deployment context
     */
    public static void stack(Context ctx) {
        // This configuration is for illustrative purposes and highlights
        // only relevant configurations for working with this functionality.

        // Map public IP on launch must be enabled for public (Internet accessible) subnets
        var example = new Subnet("example", SubnetArgs.builder()
            .mapPublicIpOnLaunch(true)
            .build());

        // The logical name is "example", matching the other language versions of
        // this example (the YAML version sets `name: example` explicitly).
        var exampleCluster = new Cluster("example", ClusterArgs.builder()
            .releaseLabel("emr-5.24.1")
            .terminationProtection(true)
            .ec2Attributes(ClusterEc2AttributesArgs.builder()
                .subnetId(example.id())
                .build())
            // Three master nodes.
            .masterInstanceGroup(ClusterMasterInstanceGroupArgs.builder()
                .instanceCount(3)
                .build())
            .coreInstanceGroup(ClusterCoreInstanceGroupArgs.builder()
                .build())
            .build());
    }
}
resources:
  # This configuration is for illustrative purposes and highlights
  # only relevant configurations for working with this functionality.

  # Map public IP on launch must be enabled for public (Internet accessible) subnets
  example:
    type: aws:ec2:Subnet
    properties:
      mapPublicIpOnLaunch: true
  exampleCluster:
    type: aws:emr:Cluster
    # Logical resource name; differs from the YAML key used for references.
    name: example
    properties:
      releaseLabel: emr-5.24.1
      terminationProtection: true
      ec2Attributes:
        subnetId: ${example.id}
      # Three master nodes.
      masterInstanceGroup:
        instanceCount: 3
      coreInstanceGroup: {}
Setting instanceCount to 3 in the masterInstanceGroup creates a high-availability cluster. The releaseLabel must be emr-5.23.0 or later. The terminationProtection property prevents accidental deletion; you must set it to false before destroying the cluster. If the cluster runs in a public (Internet-accessible) subnet, that subnet must have mapPublicIpOnLaunch enabled.
Beyond these examples
These snippets focus on specific cluster-level features: instance groups and fleets, autoscaling and spot instances, and debug logging and high availability. They’re intentionally minimal rather than full data processing deployments.
The examples reference pre-existing infrastructure such as VPC subnets, security groups, IAM roles, and S3 buckets for logs and bootstrap scripts. They focus on configuring the cluster rather than provisioning everything around it.
To keep things focused, common cluster patterns are omitted, including:
- Application configuration (configurationsJson)
- Bootstrap actions for custom setup
- Kerberos authentication (kerberosAttributes)
- Security configurations and encryption
- Step execution and job submission
These omissions are intentional: the goal is to illustrate how each cluster feature is wired, not provide drop-in data processing modules. See the EMR Cluster resource reference for all available configuration options.
Let's create AWS EMR Clusters
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Common Errors & Drift Issues
Several properties can cause perpetual differences:
- Kerberos configurations - The API doesn’t return these values after creation. Use ignoreChanges for kerberosAttributes.
- visibleToAllUsers - This deprecated argument causes drift if set to false. Don’t set this property.
- additionalInfo - No API exists to retrieve this value, so external changes aren’t detected.
- steps - If managing steps outside Pulumi, use ignoreChanges for the steps property.
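If terminationProtection is enabled, you must first apply a configuration change setting it to false before destroying the cluster.
If the Configurations value in configurationsJson is empty, omit the field instead of specifying "Configurations": []. An empty array causes errors.
IAM Roles & Security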
EMR clusters require up to three IAM roles:
- Service role (required) - Set via serviceRole for EMR to access AWS resources. Default is EMR_DefaultRole.
- Instance profile (required) - Set via ec2Attributes.instanceProfile for EC2 instances. The default EMR_EC2_DefaultRole is deprecated; create your own.
- Auto Scaling role (conditional) - Set via autoscalingRole if using automatic scaling policies.
Instance Configuration
You must choose one approach for master and core nodes:
- Instance Groups - Use masterInstanceGroup and coreInstanceGroup for simpler, fixed configurations.
- Instance Fleets - Use masterInstanceFleet and coreInstanceFleet for advanced features like spot instances and weighted capacity.
These options are mutually exclusive; you cannot mix them for the same node type.
For task nodes, use aws.emr.InstanceGroup or aws.emr.InstanceFleet. Reference your cluster via the clusterId property.
For high availability, set masterInstanceGroup.instanceCount to 3 (available in EMR 5.23.0+). You must also enable terminationProtection and ensure mapPublicIpOnLaunch is true for public subnets.
Cluster Lifecycle & Management
terminationProtection defaults to false, except when using multiple master nodes (then it defaults to true). Before destroying a cluster with termination protection enabled, you must apply a configuration change setting it to false.
The following properties are immutable: name, releaseLabel, serviceRole, applications, ec2Attributes, masterInstanceGroup, coreInstanceGroup, masterInstanceFleet, coreInstanceFleet, bootstrapActions, configurationsJson, customAmiId, ebsRootVolumeSize, kerberosAttributes, logUri, and steps. Changes to these require cluster replacement.
Use stepConcurrencyLevel to set concurrent steps (default is 1, maximum is 256). This requires releaseLabel 5.28.0 or greater.
To enable debug logging, add a step with hadoopJarStep using jar: "command-runner.jar" and args: ["state-pusher-script"]. If managing other steps externally, use ignoreChanges for the steps property to avoid drift.
Using a different cloud?
Explore analytics guides for other cloud providers: