The aws:emr/cluster:Cluster resource, part of the Pulumi AWS provider, defines an EMR cluster: its release version, applications, node configuration, and scaling behavior. This guide focuses on four capabilities: instance groups and fleets, autoscaling policies, debug logging, and high-availability master nodes.
EMR clusters depend on IAM service roles, instance profiles, VPC infrastructure, and S3 buckets. The examples are intentionally small. Combine them with your own IAM roles, networking, and storage configuration.
Launch a cluster with instance groups and autoscaling
Most deployments start with a cluster running Spark or other applications on EC2 instances organized into master and core instance groups, with automatic scaling based on workload.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Launches an EMR cluster built from instance groups, with an autoscaling
// policy attached to the core group.
//
// `main`, `sg`, `emrProfile` and `iamEmrServiceRole` are not declared in this
// snippet; they stand for a subnet, a security group, an instance profile and
// a service role defined elsewhere in your program.
//
// The three JSON documents below are written with plain double quotes. Inside
// a template literal a double quote needs no escaping, and a doubled
// backslash before it would put a literal backslash into the string and make
// the document invalid JSON.
const cluster = new aws.emr.Cluster("cluster", {
    name: "emr-test-arn",
    releaseLabel: "emr-4.6.0",
    applications: ["Spark"],
    additionalInfo: `{
  "instanceAwsClientConfiguration": {
    "proxyPort": 8099,
    "proxyHost": "myproxy.example.com"
  }
}
`,
    terminationProtection: false,
    keepJobFlowAliveWhenNoSteps: true,
    ec2Attributes: {
        subnetId: main.id,
        emrManagedMasterSecurityGroup: sg.id,
        emrManagedSlaveSecurityGroup: sg.id,
        instanceProfile: emrProfile.arn,
    },
    masterInstanceGroup: {
        instanceType: "m4.large",
    },
    coreInstanceGroup: {
        instanceType: "c4.large",
        instanceCount: 1,
        ebsConfigs: [{
            size: 40,
            type: "gp2",
            volumesPerInstance: 1,
        }],
        bidPrice: "0.30",
        // Adds one instance (capacity 1..2) when the average
        // YARNMemoryAvailablePercentage is below 15 for one 300-second period.
        autoscalingPolicy: `{
  "Constraints": {
    "MinCapacity": 1,
    "MaxCapacity": 2
  },
  "Rules": [
    {
      "Name": "ScaleOutMemoryPercentage",
      "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
      "Action": {
        "SimpleScalingPolicyConfiguration": {
          "AdjustmentType": "CHANGE_IN_CAPACITY",
          "ScalingAdjustment": 1,
          "CoolDown": 300
        }
      },
      "Trigger": {
        "CloudWatchAlarmDefinition": {
          "ComparisonOperator": "LESS_THAN",
          "EvaluationPeriods": 1,
          "MetricName": "YARNMemoryAvailablePercentage",
          "Namespace": "AWS/ElasticMapReduce",
          "Period": 300,
          "Statistic": "AVERAGE",
          "Threshold": 15.0,
          "Unit": "PERCENT"
        }
      }
    }
  ]
}
`,
    },
    ebsRootVolumeSize: 100,
    tags: {
        role: "rolename",
        env: "env",
    },
    bootstrapActions: [{
        path: "s3://elasticmapreduce/bootstrap-actions/run-if",
        name: "runif",
        args: [
            "instance.isMaster=true",
            "echo running on master node",
        ],
    }],
    // Sets JAVA_HOME for both the hadoop-env and spark-env classifications.
    configurationsJson: `[
  {
    "Classification": "hadoop-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  },
  {
    "Classification": "spark-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  }
]
`,
    serviceRole: iamEmrServiceRole.arn,
});
import pulumi
import pulumi_aws as aws
# JSON documents passed to the cluster. They are kept as module-level strings
# so the resource declaration below stays readable. Double quotes need no
# escaping inside a triple-quoted string.
additional_info_json = """{
"instanceAwsClientConfiguration": {
"proxyPort": 8099,
"proxyHost": "myproxy.example.com"
}
}
"""

# Scale-out rule for the core group: capacity 1..2, add one instance when the
# average YARNMemoryAvailablePercentage is below 15.
core_autoscaling_policy_json = """{
"Constraints": {
"MinCapacity": 1,
"MaxCapacity": 2
},
"Rules": [
{
"Name": "ScaleOutMemoryPercentage",
"Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
"Action": {
"SimpleScalingPolicyConfiguration": {
"AdjustmentType": "CHANGE_IN_CAPACITY",
"ScalingAdjustment": 1,
"CoolDown": 300
}
},
"Trigger": {
"CloudWatchAlarmDefinition": {
"ComparisonOperator": "LESS_THAN",
"EvaluationPeriods": 1,
"MetricName": "YARNMemoryAvailablePercentage",
"Namespace": "AWS/ElasticMapReduce",
"Period": 300,
"Statistic": "AVERAGE",
"Threshold": 15.0,
"Unit": "PERCENT"
}
}
}
]
}
"""

# JAVA_HOME export for the hadoop-env and spark-env classifications.
configurations_json_doc = """ [
{
"Classification": "hadoop-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
}
}
],
"Properties": {}
},
{
"Classification": "spark-env",
"Configurations": [
{
"Classification": "export",
"Properties": {
"JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
}
}
],
"Properties": {}
}
]
"""

# EMR cluster using instance groups. `main`, `sg`, `emr_profile` and
# `iam_emr_service_role` are not defined in this snippet and must come from
# the rest of your program.
cluster = aws.emr.Cluster(
    "cluster",
    name="emr-test-arn",
    release_label="emr-4.6.0",
    applications=["Spark"],
    additional_info=additional_info_json,
    termination_protection=False,
    keep_job_flow_alive_when_no_steps=True,
    ec2_attributes={
        "subnet_id": main["id"],
        "emr_managed_master_security_group": sg["id"],
        "emr_managed_slave_security_group": sg["id"],
        "instance_profile": emr_profile["arn"],
    },
    master_instance_group={"instance_type": "m4.large"},
    core_instance_group={
        "instance_type": "c4.large",
        "instance_count": 1,
        "ebs_configs": [
            {"size": 40, "type": "gp2", "volumes_per_instance": 1},
        ],
        "bid_price": "0.30",
        "autoscaling_policy": core_autoscaling_policy_json,
    },
    ebs_root_volume_size=100,
    tags={"role": "rolename", "env": "env"},
    bootstrap_actions=[
        {
            "path": "s3://elasticmapreduce/bootstrap-actions/run-if",
            "name": "runif",
            "args": ["instance.isMaster=true", "echo running on master node"],
        },
    ],
    configurations_json=configurations_json_doc,
    service_role=iam_emr_service_role["arn"],
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main declares an EMR cluster built from instance groups, with an
// autoscaling policy on the core group.
//
// The identifiers main.Id, sg.Id, emrProfile.Arn and iamEmrServiceRole.Arn
// are placeholders carried over from the example; they are not declared in
// this snippet and must be replaced with your own subnet, security group,
// instance profile and service role.
//
// The JSON documents are Go raw string literals. Raw strings do not process
// backslash escapes, so the quotes are written plainly; a backslash before a
// quote would be kept in the value and make the document invalid JSON.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		_, err := emr.NewCluster(ctx, "cluster", &emr.ClusterArgs{
			Name:         pulumi.String("emr-test-arn"),
			ReleaseLabel: pulumi.String("emr-4.6.0"),
			Applications: pulumi.StringArray{
				pulumi.String("Spark"),
			},
			AdditionalInfo: pulumi.String(`{
  "instanceAwsClientConfiguration": {
    "proxyPort": 8099,
    "proxyHost": "myproxy.example.com"
  }
}
`),
			TerminationProtection:       pulumi.Bool(false),
			KeepJobFlowAliveWhenNoSteps: pulumi.Bool(true),
			Ec2Attributes: &emr.ClusterEc2AttributesArgs{
				SubnetId:                      pulumi.Any(main.Id),
				EmrManagedMasterSecurityGroup: pulumi.Any(sg.Id),
				EmrManagedSlaveSecurityGroup:  pulumi.Any(sg.Id),
				InstanceProfile:               pulumi.Any(emrProfile.Arn),
			},
			MasterInstanceGroup: &emr.ClusterMasterInstanceGroupArgs{
				InstanceType: pulumi.String("m4.large"),
			},
			CoreInstanceGroup: &emr.ClusterCoreInstanceGroupArgs{
				InstanceType:  pulumi.String("c4.large"),
				InstanceCount: pulumi.Int(1),
				EbsConfigs: emr.ClusterCoreInstanceGroupEbsConfigArray{
					&emr.ClusterCoreInstanceGroupEbsConfigArgs{
						Size:               pulumi.Int(40),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				BidPrice: pulumi.String("0.30"),
				// Capacity 1..2; add one instance when the average
				// YARNMemoryAvailablePercentage is below 15.
				AutoscalingPolicy: pulumi.String(`{
  "Constraints": {
    "MinCapacity": 1,
    "MaxCapacity": 2
  },
  "Rules": [
    {
      "Name": "ScaleOutMemoryPercentage",
      "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
      "Action": {
        "SimpleScalingPolicyConfiguration": {
          "AdjustmentType": "CHANGE_IN_CAPACITY",
          "ScalingAdjustment": 1,
          "CoolDown": 300
        }
      },
      "Trigger": {
        "CloudWatchAlarmDefinition": {
          "ComparisonOperator": "LESS_THAN",
          "EvaluationPeriods": 1,
          "MetricName": "YARNMemoryAvailablePercentage",
          "Namespace": "AWS/ElasticMapReduce",
          "Period": 300,
          "Statistic": "AVERAGE",
          "Threshold": 15.0,
          "Unit": "PERCENT"
        }
      }
    }
  ]
}
`),
			},
			EbsRootVolumeSize: pulumi.Int(100),
			Tags: pulumi.StringMap{
				"role": pulumi.String("rolename"),
				"env":  pulumi.String("env"),
			},
			BootstrapActions: emr.ClusterBootstrapActionArray{
				&emr.ClusterBootstrapActionArgs{
					Path: pulumi.String("s3://elasticmapreduce/bootstrap-actions/run-if"),
					Name: pulumi.String("runif"),
					Args: pulumi.StringArray{
						pulumi.String("instance.isMaster=true"),
						pulumi.String("echo running on master node"),
					},
				},
			},
			// Sets JAVA_HOME for the hadoop-env and spark-env classifications.
			ConfigurationsJson: pulumi.String(`[
  {
    "Classification": "hadoop-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  },
  {
    "Classification": "spark-env",
    "Configurations": [
      {
        "Classification": "export",
        "Properties": {
          "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
        }
      }
    ],
    "Properties": {}
  }
]
`),
			ServiceRole: pulumi.Any(iamEmrServiceRole.Arn),
		})
		if err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares an EMR cluster built from instance groups, with an autoscaling
// policy on the core group.
//
// `main`, `sg`, `emrProfile` and `iamEmrServiceRole` are not declared in this
// snippet; supply your own subnet, security group, instance profile and
// service role.
//
// The JSON documents are verbatim strings (@"..."). In a verbatim string a
// double quote is written as "" and a backslash is an ordinary character, so
// no backslashes appear before the quotes; adding them would put literal
// backslashes into the JSON and make it invalid.
return await Deployment.RunAsync(() =>
{
    var cluster = new Aws.Emr.Cluster("cluster", new()
    {
        Name = "emr-test-arn",
        ReleaseLabel = "emr-4.6.0",
        Applications = new[]
        {
            "Spark",
        },
        AdditionalInfo = @"{
  ""instanceAwsClientConfiguration"": {
    ""proxyPort"": 8099,
    ""proxyHost"": ""myproxy.example.com""
  }
}
",
        TerminationProtection = false,
        KeepJobFlowAliveWhenNoSteps = true,
        Ec2Attributes = new Aws.Emr.Inputs.ClusterEc2AttributesArgs
        {
            SubnetId = main.Id,
            EmrManagedMasterSecurityGroup = sg.Id,
            EmrManagedSlaveSecurityGroup = sg.Id,
            InstanceProfile = emrProfile.Arn,
        },
        MasterInstanceGroup = new Aws.Emr.Inputs.ClusterMasterInstanceGroupArgs
        {
            InstanceType = "m4.large",
        },
        CoreInstanceGroup = new Aws.Emr.Inputs.ClusterCoreInstanceGroupArgs
        {
            InstanceType = "c4.large",
            InstanceCount = 1,
            EbsConfigs = new[]
            {
                new Aws.Emr.Inputs.ClusterCoreInstanceGroupEbsConfigArgs
                {
                    Size = 40,
                    Type = "gp2",
                    VolumesPerInstance = 1,
                },
            },
            BidPrice = "0.30",
            // Capacity 1..2; add one instance when the average
            // YARNMemoryAvailablePercentage is below 15.
            AutoscalingPolicy = @"{
  ""Constraints"": {
    ""MinCapacity"": 1,
    ""MaxCapacity"": 2
  },
  ""Rules"": [
    {
      ""Name"": ""ScaleOutMemoryPercentage"",
      ""Description"": ""Scale out if YARNMemoryAvailablePercentage is less than 15"",
      ""Action"": {
        ""SimpleScalingPolicyConfiguration"": {
          ""AdjustmentType"": ""CHANGE_IN_CAPACITY"",
          ""ScalingAdjustment"": 1,
          ""CoolDown"": 300
        }
      },
      ""Trigger"": {
        ""CloudWatchAlarmDefinition"": {
          ""ComparisonOperator"": ""LESS_THAN"",
          ""EvaluationPeriods"": 1,
          ""MetricName"": ""YARNMemoryAvailablePercentage"",
          ""Namespace"": ""AWS/ElasticMapReduce"",
          ""Period"": 300,
          ""Statistic"": ""AVERAGE"",
          ""Threshold"": 15.0,
          ""Unit"": ""PERCENT""
        }
      }
    }
  ]
}
",
        },
        EbsRootVolumeSize = 100,
        Tags =
        {
            { "role", "rolename" },
            { "env", "env" },
        },
        BootstrapActions = new[]
        {
            new Aws.Emr.Inputs.ClusterBootstrapActionArgs
            {
                Path = "s3://elasticmapreduce/bootstrap-actions/run-if",
                Name = "runif",
                Args = new[]
                {
                    "instance.isMaster=true",
                    "echo running on master node",
                },
            },
        },
        // Sets JAVA_HOME for the hadoop-env and spark-env classifications.
        ConfigurationsJson = @"[
  {
    ""Classification"": ""hadoop-env"",
    ""Configurations"": [
      {
        ""Classification"": ""export"",
        ""Properties"": {
          ""JAVA_HOME"": ""/usr/lib/jvm/java-1.8.0""
        }
      }
    ],
    ""Properties"": {}
  },
  {
    ""Classification"": ""spark-env"",
    ""Configurations"": [
      {
        ""Classification"": ""export"",
        ""Properties"": {
          ""JAVA_HOME"": ""/usr/lib/jvm/java-1.8.0""
        }
      }
    ],
    ""Properties"": {}
  }
]
",
        ServiceRole = iamEmrServiceRole.Arn,
    });

});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterEc2AttributesArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterBootstrapActionArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var cluster = new Cluster("cluster", ClusterArgs.builder()
.name("emr-test-arn")
.releaseLabel("emr-4.6.0")
.applications("Spark")
.additionalInfo("""
{
\"instanceAwsClientConfiguration\": {
\"proxyPort\": 8099,
\"proxyHost\": \"myproxy.example.com\"
}
}
""")
.terminationProtection(false)
.keepJobFlowAliveWhenNoSteps(true)
.ec2Attributes(ClusterEc2AttributesArgs.builder()
.subnetId(main.id())
.emrManagedMasterSecurityGroup(sg.id())
.emrManagedSlaveSecurityGroup(sg.id())
.instanceProfile(emrProfile.arn())
.build())
.masterInstanceGroup(ClusterMasterInstanceGroupArgs.builder()
.instanceType("m4.large")
.build())
.coreInstanceGroup(ClusterCoreInstanceGroupArgs.builder()
.instanceType("c4.large")
.instanceCount(1)
.ebsConfigs(ClusterCoreInstanceGroupEbsConfigArgs.builder()
.size(40)
.type("gp2")
.volumesPerInstance(1)
.build())
.bidPrice("0.30")
.autoscalingPolicy("""
{
\"Constraints\": {
\"MinCapacity\": 1,
\"MaxCapacity\": 2
},
\"Rules\": [
{
\"Name\": \"ScaleOutMemoryPercentage\",
\"Description\": \"Scale out if YARNMemoryAvailablePercentage is less than 15\",
\"Action\": {
\"SimpleScalingPolicyConfiguration\": {
\"AdjustmentType\": \"CHANGE_IN_CAPACITY\",
\"ScalingAdjustment\": 1,
\"CoolDown\": 300
}
},
\"Trigger\": {
\"CloudWatchAlarmDefinition\": {
\"ComparisonOperator\": \"LESS_THAN\",
\"EvaluationPeriods\": 1,
\"MetricName\": \"YARNMemoryAvailablePercentage\",
\"Namespace\": \"AWS/ElasticMapReduce\",
\"Period\": 300,
\"Statistic\": \"AVERAGE\",
\"Threshold\": 15.0,
\"Unit\": \"PERCENT\"
}
}
}
]
}
""")
.build())
.ebsRootVolumeSize(100)
.tags(Map.ofEntries(
Map.entry("role", "rolename"),
Map.entry("env", "env")
))
.bootstrapActions(ClusterBootstrapActionArgs.builder()
.path("s3://elasticmapreduce/bootstrap-actions/run-if")
.name("runif")
.args(
"instance.isMaster=true",
"echo running on master node")
.build())
.configurationsJson("""
[
{
\"Classification\": \"hadoop-env\",
\"Configurations\": [
{
\"Classification\": \"export\",
\"Properties\": {
\"JAVA_HOME\": \"/usr/lib/jvm/java-1.8.0\"
}
}
],
\"Properties\": {}
},
{
\"Classification\": \"spark-env\",
\"Configurations\": [
{
\"Classification\": \"export\",
\"Properties\": {
\"JAVA_HOME\": \"/usr/lib/jvm/java-1.8.0\"
}
}
],
\"Properties\": {}
}
]
""")
.serviceRole(iamEmrServiceRole.arn())
.build());
}
}
# EMR cluster built from instance groups, with an autoscaling policy on the
# core group. ${main.id}, ${sg.id}, ${emrProfile.arn} and
# ${iamEmrServiceRole.arn} refer to resources that are not declared in this
# snippet.
#
# The JSON documents are literal block scalars (|). Block scalars do not
# process backslash escapes, so quotes are written plainly; a backslash before
# a quote would be kept in the value and make the document invalid JSON.
resources:
  cluster:
    type: aws:emr:Cluster
    properties:
      name: emr-test-arn
      releaseLabel: emr-4.6.0
      applications:
        - Spark
      additionalInfo: |
        {
          "instanceAwsClientConfiguration": {
            "proxyPort": 8099,
            "proxyHost": "myproxy.example.com"
          }
        }
      terminationProtection: false
      keepJobFlowAliveWhenNoSteps: true
      ec2Attributes:
        subnetId: ${main.id}
        emrManagedMasterSecurityGroup: ${sg.id}
        emrManagedSlaveSecurityGroup: ${sg.id}
        instanceProfile: ${emrProfile.arn}
      masterInstanceGroup:
        instanceType: m4.large
      coreInstanceGroup:
        instanceType: c4.large
        instanceCount: 1
        ebsConfigs:
          # Integer, matching the other numeric fields in this block.
          - size: 40
            type: gp2
            volumesPerInstance: 1
        bidPrice: '0.30'
        # Capacity 1..2; add one instance when the average
        # YARNMemoryAvailablePercentage is below 15.
        autoscalingPolicy: |
          {
            "Constraints": {
              "MinCapacity": 1,
              "MaxCapacity": 2
            },
            "Rules": [
              {
                "Name": "ScaleOutMemoryPercentage",
                "Description": "Scale out if YARNMemoryAvailablePercentage is less than 15",
                "Action": {
                  "SimpleScalingPolicyConfiguration": {
                    "AdjustmentType": "CHANGE_IN_CAPACITY",
                    "ScalingAdjustment": 1,
                    "CoolDown": 300
                  }
                },
                "Trigger": {
                  "CloudWatchAlarmDefinition": {
                    "ComparisonOperator": "LESS_THAN",
                    "EvaluationPeriods": 1,
                    "MetricName": "YARNMemoryAvailablePercentage",
                    "Namespace": "AWS/ElasticMapReduce",
                    "Period": 300,
                    "Statistic": "AVERAGE",
                    "Threshold": 15.0,
                    "Unit": "PERCENT"
                  }
                }
              }
            ]
          }
      ebsRootVolumeSize: 100
      tags:
        role: rolename
        env: env
      bootstrapActions:
        - path: s3://elasticmapreduce/bootstrap-actions/run-if
          name: runif
          args:
            - instance.isMaster=true
            - echo running on master node
      # Sets JAVA_HOME for the hadoop-env and spark-env classifications.
      configurationsJson: |
        [
          {
            "Classification": "hadoop-env",
            "Configurations": [
              {
                "Classification": "export",
                "Properties": {
                  "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                }
              }
            ],
            "Properties": {}
          },
          {
            "Classification": "spark-env",
            "Configurations": [
              {
                "Classification": "export",
                "Properties": {
                  "JAVA_HOME": "/usr/lib/jvm/java-1.8.0"
                }
              }
            ],
            "Properties": {}
          }
        ]
      serviceRole: ${iamEmrServiceRole.arn}
The cluster launches with the specified releaseLabel and applications. The masterInstanceGroup defines the master node type, while coreInstanceGroup configures worker nodes with EBS storage and autoscaling. The autoscalingPolicy is a JSON document that defines capacity constraints and CloudWatch-driven scaling rules; here, the core group adds one instance (up to a maximum of two) when the average YARNMemoryAvailablePercentage falls below 15% over a 300-second period. The ec2Attributes block places instances in your VPC subnet with security groups and an instance profile for AWS service access.
Use instance fleets for flexible capacity
Instance fleets let you specify multiple instance types and purchase options, allowing EMR to optimize for cost and availability.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// One 100 GiB gp2 volume per instance; every Spot-capable instance type in
// both fleets uses this same EBS layout.
const gp2Volumes = [{
    size: 100,
    type: "gp2",
    volumesPerInstance: 1,
}];

// Spot settings common to both fleets; each fleet supplies its own
// timeoutAction.
const spotDefaults = {
    allocationStrategy: "capacity-optimized",
    blockDurationMinutes: 0,
    timeoutDurationMinutes: 10,
};

// Instance types offered to the core fleet.
const coreInstanceTypes = [
    { instanceType: "m3.xlarge", weightedCapacity: 1, bidPriceAsPercentageOfOnDemandPrice: 80 },
    { instanceType: "m4.xlarge", weightedCapacity: 1, bidPriceAsPercentageOfOnDemandPrice: 100 },
    { instanceType: "m4.2xlarge", weightedCapacity: 2, bidPriceAsPercentageOfOnDemandPrice: 100 },
].map((cfg) => ({ ...cfg, ebsConfigs: gp2Volumes }));

// Instance types offered to the task fleet.
const taskInstanceTypes = [
    { instanceType: "m4.xlarge", weightedCapacity: 1, bidPriceAsPercentageOfOnDemandPrice: 100 },
    { instanceType: "m4.2xlarge", weightedCapacity: 2, bidPriceAsPercentageOfOnDemandPrice: 100 },
].map((cfg) => ({ ...cfg, ebsConfigs: gp2Volumes }));

// Cluster with an on-demand master fleet and a mixed on-demand/Spot core fleet.
const example = new aws.emr.Cluster("example", {
    masterInstanceFleet: {
        instanceTypeConfigs: [{ instanceType: "m4.xlarge" }],
        targetOnDemandCapacity: 1,
    },
    coreInstanceFleet: {
        name: "core fleet",
        instanceTypeConfigs: coreInstanceTypes,
        launchSpecifications: {
            spotSpecifications: [{ ...spotDefaults, timeoutAction: "SWITCH_TO_ON_DEMAND" }],
        },
        targetOnDemandCapacity: 2,
        targetSpotCapacity: 2,
    },
});

// Task fleet attached to the cluster above as a separate resource.
const task = new aws.emr.InstanceFleet("task", {
    clusterId: example.id,
    name: "task fleet",
    instanceTypeConfigs: taskInstanceTypes,
    launchSpecifications: {
        spotSpecifications: [{ ...spotDefaults, timeoutAction: "TERMINATE_CLUSTER" }],
    },
    targetOnDemandCapacity: 1,
    targetSpotCapacity: 1,
});
import pulumi
import pulumi_aws as aws
def _spot_instance_type(instance_type, weighted_capacity, bid_percent=100):
    """Build one instance-type entry with a single 100 GiB gp2 volume."""
    return {
        "bid_price_as_percentage_of_on_demand_price": bid_percent,
        "ebs_configs": [{
            "size": 100,
            "type": "gp2",
            "volumes_per_instance": 1,
        }],
        "instance_type": instance_type,
        "weighted_capacity": weighted_capacity,
    }


def _spot_launch_specifications(timeout_action):
    """Build the Spot launch specification; only the timeout action varies."""
    return {
        "spot_specifications": [{
            "allocation_strategy": "capacity-optimized",
            "block_duration_minutes": 0,
            "timeout_action": timeout_action,
            "timeout_duration_minutes": 10,
        }],
    }


# Cluster with an on-demand master fleet and a mixed on-demand/Spot core fleet.
example = aws.emr.Cluster(
    "example",
    master_instance_fleet={
        "instance_type_configs": [{"instance_type": "m4.xlarge"}],
        "target_on_demand_capacity": 1,
    },
    core_instance_fleet={
        "name": "core fleet",
        "instance_type_configs": [
            _spot_instance_type("m3.xlarge", 1, bid_percent=80),
            _spot_instance_type("m4.xlarge", 1),
            _spot_instance_type("m4.2xlarge", 2),
        ],
        "launch_specifications": _spot_launch_specifications("SWITCH_TO_ON_DEMAND"),
        "target_on_demand_capacity": 2,
        "target_spot_capacity": 2,
    },
)

# Task fleet attached to the cluster above as a separate resource.
task = aws.emr.InstanceFleet(
    "task",
    cluster_id=example.id,
    name="task fleet",
    instance_type_configs=[
        _spot_instance_type("m4.xlarge", 1),
        _spot_instance_type("m4.2xlarge", 2),
    ],
    launch_specifications=_spot_launch_specifications("TERMINATE_CLUSTER"),
    target_on_demand_capacity=1,
    target_spot_capacity=1,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main declares an EMR cluster that uses instance fleets (on-demand master
// fleet, mixed on-demand/Spot core fleet) and a separate task fleet attached
// to that cluster.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// coreType builds one core-fleet instance type with a single
		// 100 GiB gp2 volume per instance.
		coreType := func(instanceType string, weight int, bidPercent float64) *emr.ClusterCoreInstanceFleetInstanceTypeConfigArgs {
			return &emr.ClusterCoreInstanceFleetInstanceTypeConfigArgs{
				BidPriceAsPercentageOfOnDemandPrice: pulumi.Float64(bidPercent),
				EbsConfigs: emr.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArray{
					&emr.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs{
						Size:               pulumi.Int(100),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				InstanceType:     pulumi.String(instanceType),
				WeightedCapacity: pulumi.Int(weight),
			}
		}
		// taskType is the task-fleet counterpart of coreType; the SDK uses
		// distinct argument types for the standalone InstanceFleet resource.
		taskType := func(instanceType string, weight int, bidPercent float64) *emr.InstanceFleetInstanceTypeConfigArgs {
			return &emr.InstanceFleetInstanceTypeConfigArgs{
				BidPriceAsPercentageOfOnDemandPrice: pulumi.Float64(bidPercent),
				EbsConfigs: emr.InstanceFleetInstanceTypeConfigEbsConfigArray{
					&emr.InstanceFleetInstanceTypeConfigEbsConfigArgs{
						Size:               pulumi.Int(100),
						Type:               pulumi.String("gp2"),
						VolumesPerInstance: pulumi.Int(1),
					},
				},
				InstanceType:     pulumi.String(instanceType),
				WeightedCapacity: pulumi.Int(weight),
			}
		}

		example, err := emr.NewCluster(ctx, "example", &emr.ClusterArgs{
			MasterInstanceFleet: &emr.ClusterMasterInstanceFleetArgs{
				InstanceTypeConfigs: emr.ClusterMasterInstanceFleetInstanceTypeConfigArray{
					&emr.ClusterMasterInstanceFleetInstanceTypeConfigArgs{
						InstanceType: pulumi.String("m4.xlarge"),
					},
				},
				TargetOnDemandCapacity: pulumi.Int(1),
			},
			CoreInstanceFleet: &emr.ClusterCoreInstanceFleetArgs{
				Name: pulumi.String("core fleet"),
				InstanceTypeConfigs: emr.ClusterCoreInstanceFleetInstanceTypeConfigArray{
					coreType("m3.xlarge", 1, 80),
					coreType("m4.xlarge", 1, 100),
					coreType("m4.2xlarge", 2, 100),
				},
				LaunchSpecifications: &emr.ClusterCoreInstanceFleetLaunchSpecificationsArgs{
					SpotSpecifications: emr.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArray{
						&emr.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs{
							AllocationStrategy:     pulumi.String("capacity-optimized"),
							BlockDurationMinutes:   pulumi.Int(0),
							TimeoutAction:          pulumi.String("SWITCH_TO_ON_DEMAND"),
							TimeoutDurationMinutes: pulumi.Int(10),
						},
					},
				},
				TargetOnDemandCapacity: pulumi.Int(2),
				TargetSpotCapacity:     pulumi.Int(2),
			},
		})
		if err != nil {
			return err
		}

		// The task fleet is a separate resource that references the cluster ID.
		_, err = emr.NewInstanceFleet(ctx, "task", &emr.InstanceFleetArgs{
			ClusterId: example.ID(),
			Name:      pulumi.String("task fleet"),
			InstanceTypeConfigs: emr.InstanceFleetInstanceTypeConfigArray{
				taskType("m4.xlarge", 1, 100),
				taskType("m4.2xlarge", 2, 100),
			},
			LaunchSpecifications: &emr.InstanceFleetLaunchSpecificationsArgs{
				SpotSpecifications: emr.InstanceFleetLaunchSpecificationsSpotSpecificationArray{
					&emr.InstanceFleetLaunchSpecificationsSpotSpecificationArgs{
						AllocationStrategy:     pulumi.String("capacity-optimized"),
						BlockDurationMinutes:   pulumi.Int(0),
						TimeoutAction:          pulumi.String("TERMINATE_CLUSTER"),
						TimeoutDurationMinutes: pulumi.Int(10),
					},
				},
			},
			TargetOnDemandCapacity: pulumi.Int(1),
			TargetSpotCapacity:     pulumi.Int(1),
		})
		return err
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Declares an EMR cluster that uses instance fleets (on-demand master fleet,
// mixed on-demand/Spot core fleet) and a separate task fleet attached to it.
return await Deployment.RunAsync(() =>
{
    // Builds one core-fleet instance type with a single 100 GiB gp2 volume.
    Aws.Emr.Inputs.ClusterCoreInstanceFleetInstanceTypeConfigArgs CoreType(
        string instanceType, int weightedCapacity, double bidPercent)
    {
        return new Aws.Emr.Inputs.ClusterCoreInstanceFleetInstanceTypeConfigArgs
        {
            BidPriceAsPercentageOfOnDemandPrice = bidPercent,
            EbsConfigs = new[]
            {
                new Aws.Emr.Inputs.ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs
                {
                    Size = 100,
                    Type = "gp2",
                    VolumesPerInstance = 1,
                },
            },
            InstanceType = instanceType,
            WeightedCapacity = weightedCapacity,
        };
    }

    // Task-fleet counterpart of CoreType; the standalone InstanceFleet
    // resource has its own argument types.
    Aws.Emr.Inputs.InstanceFleetInstanceTypeConfigArgs TaskType(
        string instanceType, int weightedCapacity, double bidPercent)
    {
        return new Aws.Emr.Inputs.InstanceFleetInstanceTypeConfigArgs
        {
            BidPriceAsPercentageOfOnDemandPrice = bidPercent,
            EbsConfigs = new[]
            {
                new Aws.Emr.Inputs.InstanceFleetInstanceTypeConfigEbsConfigArgs
                {
                    Size = 100,
                    Type = "gp2",
                    VolumesPerInstance = 1,
                },
            },
            InstanceType = instanceType,
            WeightedCapacity = weightedCapacity,
        };
    }

    var example = new Aws.Emr.Cluster("example", new()
    {
        MasterInstanceFleet = new Aws.Emr.Inputs.ClusterMasterInstanceFleetArgs
        {
            InstanceTypeConfigs = new[]
            {
                new Aws.Emr.Inputs.ClusterMasterInstanceFleetInstanceTypeConfigArgs
                {
                    InstanceType = "m4.xlarge",
                },
            },
            TargetOnDemandCapacity = 1,
        },
        CoreInstanceFleet = new Aws.Emr.Inputs.ClusterCoreInstanceFleetArgs
        {
            Name = "core fleet",
            InstanceTypeConfigs = new[]
            {
                CoreType("m3.xlarge", 1, 80),
                CoreType("m4.xlarge", 1, 100),
                CoreType("m4.2xlarge", 2, 100),
            },
            LaunchSpecifications = new Aws.Emr.Inputs.ClusterCoreInstanceFleetLaunchSpecificationsArgs
            {
                SpotSpecifications = new[]
                {
                    new Aws.Emr.Inputs.ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs
                    {
                        AllocationStrategy = "capacity-optimized",
                        BlockDurationMinutes = 0,
                        TimeoutAction = "SWITCH_TO_ON_DEMAND",
                        TimeoutDurationMinutes = 10,
                    },
                },
            },
            TargetOnDemandCapacity = 2,
            TargetSpotCapacity = 2,
        },
    });

    // The task fleet is a separate resource that references the cluster ID.
    var task = new Aws.Emr.InstanceFleet("task", new()
    {
        ClusterId = example.Id,
        Name = "task fleet",
        InstanceTypeConfigs = new[]
        {
            TaskType("m4.xlarge", 1, 100),
            TaskType("m4.2xlarge", 2, 100),
        },
        LaunchSpecifications = new Aws.Emr.Inputs.InstanceFleetLaunchSpecificationsArgs
        {
            SpotSpecifications = new[]
            {
                new Aws.Emr.Inputs.InstanceFleetLaunchSpecificationsSpotSpecificationArgs
                {
                    AllocationStrategy = "capacity-optimized",
                    BlockDurationMinutes = 0,
                    TimeoutAction = "TERMINATE_CLUSTER",
                    TimeoutDurationMinutes = 10,
                },
            },
        },
        TargetOnDemandCapacity = 1,
        TargetSpotCapacity = 1,
    });

});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceFleetArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceFleetLaunchSpecificationsArgs;
import com.pulumi.aws.emr.InstanceFleet;
import com.pulumi.aws.emr.InstanceFleetArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetInstanceTypeConfigArgs;
import com.pulumi.aws.emr.inputs.InstanceFleetLaunchSpecificationsArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
public class App {
public static void main(String[] args) {
Pulumi.run(App::stack);
}
public static void stack(Context ctx) {
var example = new Cluster("example", ClusterArgs.builder()
.masterInstanceFleet(ClusterMasterInstanceFleetArgs.builder()
.instanceTypeConfigs(ClusterMasterInstanceFleetInstanceTypeConfigArgs.builder()
.instanceType("m4.xlarge")
.build())
.targetOnDemandCapacity(1)
.build())
.coreInstanceFleet(ClusterCoreInstanceFleetArgs.builder()
.instanceTypeConfigs(
ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
.bidPriceAsPercentageOfOnDemandPrice(80.0)
.ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
.size(100)
.type("gp2")
.volumesPerInstance(1)
.build())
.instanceType("m3.xlarge")
.weightedCapacity(1)
.build(),
ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
.bidPriceAsPercentageOfOnDemandPrice(100.0)
.ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
.size(100)
.type("gp2")
.volumesPerInstance(1)
.build())
.instanceType("m4.xlarge")
.weightedCapacity(1)
.build(),
ClusterCoreInstanceFleetInstanceTypeConfigArgs.builder()
.bidPriceAsPercentageOfOnDemandPrice(100.0)
.ebsConfigs(ClusterCoreInstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
.size(100)
.type("gp2")
.volumesPerInstance(1)
.build())
.instanceType("m4.2xlarge")
.weightedCapacity(2)
.build())
.launchSpecifications(ClusterCoreInstanceFleetLaunchSpecificationsArgs.builder()
.spotSpecifications(ClusterCoreInstanceFleetLaunchSpecificationsSpotSpecificationArgs.builder()
.allocationStrategy("capacity-optimized")
.blockDurationMinutes(0)
.timeoutAction("SWITCH_TO_ON_DEMAND")
.timeoutDurationMinutes(10)
.build())
.build())
.name("core fleet")
.targetOnDemandCapacity(2)
.targetSpotCapacity(2)
.build())
.build());
var task = new InstanceFleet("task", InstanceFleetArgs.builder()
.clusterId(example.id())
.instanceTypeConfigs(
InstanceFleetInstanceTypeConfigArgs.builder()
.bidPriceAsPercentageOfOnDemandPrice(100.0)
.ebsConfigs(InstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
.size(100)
.type("gp2")
.volumesPerInstance(1)
.build())
.instanceType("m4.xlarge")
.weightedCapacity(1)
.build(),
InstanceFleetInstanceTypeConfigArgs.builder()
.bidPriceAsPercentageOfOnDemandPrice(100.0)
.ebsConfigs(InstanceFleetInstanceTypeConfigEbsConfigArgs.builder()
.size(100)
.type("gp2")
.volumesPerInstance(1)
.build())
.instanceType("m4.2xlarge")
.weightedCapacity(2)
.build())
.launchSpecifications(InstanceFleetLaunchSpecificationsArgs.builder()
.spotSpecifications(InstanceFleetLaunchSpecificationsSpotSpecificationArgs.builder()
.allocationStrategy("capacity-optimized")
.blockDurationMinutes(0)
.timeoutAction("TERMINATE_CLUSTER")
.timeoutDurationMinutes(10)
.build())
.build())
.name("task fleet")
.targetOnDemandCapacity(1)
.targetSpotCapacity(1)
.build());
}
}
# EMR cluster using instance fleets, plus a task fleet declared as a separate
# resource that references the cluster.
resources:
  example:
    type: aws:emr:Cluster
    properties:
      # Master fleet: one on-demand unit of m4.xlarge.
      masterInstanceFleet:
        targetOnDemandCapacity: 1
        instanceTypeConfigs:
          - instanceType: m4.xlarge
      # Core fleet: two on-demand and two Spot units drawn from three
      # instance types.
      coreInstanceFleet:
        name: core fleet
        targetOnDemandCapacity: 2
        targetSpotCapacity: 2
        instanceTypeConfigs:
          - instanceType: m3.xlarge
            weightedCapacity: 1
            bidPriceAsPercentageOfOnDemandPrice: 80
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
          - instanceType: m4.xlarge
            weightedCapacity: 1
            bidPriceAsPercentageOfOnDemandPrice: 100
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
          - instanceType: m4.2xlarge
            weightedCapacity: 2
            bidPriceAsPercentageOfOnDemandPrice: 100
            ebsConfigs:
              - size: 100
                type: gp2
                volumesPerInstance: 1
        launchSpecifications:
          spotSpecifications:
            - allocationStrategy: capacity-optimized
              blockDurationMinutes: 0
              timeoutAction: SWITCH_TO_ON_DEMAND
              timeoutDurationMinutes: 10
  # Task fleet: one on-demand and one Spot unit, attached via clusterId.
  task:
    type: aws:emr:InstanceFleet
    properties:
      clusterId: ${example.id}
      name: task fleet
      targetOnDemandCapacity: 1
      targetSpotCapacity: 1
      instanceTypeConfigs:
        - instanceType: m4.xlarge
          weightedCapacity: 1
          bidPriceAsPercentageOfOnDemandPrice: 100
          ebsConfigs:
            - size: 100
              type: gp2
              volumesPerInstance: 1
        - instanceType: m4.2xlarge
          weightedCapacity: 2
          bidPriceAsPercentageOfOnDemandPrice: 100
          ebsConfigs:
            - size: 100
              type: gp2
              volumesPerInstance: 1
      launchSpecifications:
        spotSpecifications:
          - allocationStrategy: capacity-optimized
            blockDurationMinutes: 0
            timeoutAction: TERMINATE_CLUSTER
            timeoutDurationMinutes: 10
Instance fleets replace instance groups with a more flexible model. The instanceTypeConfigs array lists multiple instance types with weighted capacity, letting EMR choose the best mix. The launchSpecifications block configures spot instance behavior, including allocation strategy and timeout handling. Set targetOnDemandCapacity and targetSpotCapacity to control the mix of purchase options.
Enable debug logging with a setup step
EMR implements debug logging as a cluster step that configures log delivery to S3.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Cluster step that runs command-runner.jar with the state-pusher-script
// argument; the whole cluster is terminated if the step fails.
const debuggingStep = {
    name: "Setup Hadoop Debugging",
    actionOnFailure: "TERMINATE_CLUSTER",
    hadoopJarStep: {
        jar: "command-runner.jar",
        args: ["state-pusher-script"],
    },
};

// EMR cluster whose only configured step is the debugging setup step.
const example = new aws.emr.Cluster("example", {
    steps: [debuggingStep],
});
import pulumi
import pulumi_aws as aws
# Cluster step that runs command-runner.jar with the state-pusher-script
# argument; the whole cluster is terminated if the step fails.
debugging_step = {
    "name": "Setup Hadoop Debugging",
    "action_on_failure": "TERMINATE_CLUSTER",
    "hadoop_jar_step": {
        "jar": "command-runner.jar",
        "args": ["state-pusher-script"],
    },
}

# EMR cluster whose only configured step is the debugging setup step.
example = aws.emr.Cluster("example", steps=[debugging_step])
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main registers an EMR cluster named "example" whose single step runs
// command-runner.jar with the state-pusher-script argument.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// The step terminates the cluster if it fails.
		debuggingStep := &emr.ClusterStepArgs{
			Name:            pulumi.String("Setup Hadoop Debugging"),
			ActionOnFailure: pulumi.String("TERMINATE_CLUSTER"),
			HadoopJarStep: &emr.ClusterStepHadoopJarStepArgs{
				Jar:  pulumi.String("command-runner.jar"),
				Args: pulumi.StringArray{pulumi.String("state-pusher-script")},
			},
		}

		clusterArgs := &emr.ClusterArgs{
			Steps: emr.ClusterStepArray{debuggingStep},
		}

		if _, err := emr.NewCluster(ctx, "example", clusterArgs); err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Registers an EMR cluster named "example" whose single step runs
// command-runner.jar with the state-pusher-script argument.
return await Deployment.RunAsync(() =>
{
    // The step terminates the cluster if it fails.
    var debuggingStep = new Aws.Emr.Inputs.ClusterStepArgs
    {
        Name = "Setup Hadoop Debugging",
        ActionOnFailure = "TERMINATE_CLUSTER",
        HadoopJarStep = new Aws.Emr.Inputs.ClusterStepHadoopJarStepArgs
        {
            Jar = "command-runner.jar",
            Args = new[] { "state-pusher-script" },
        },
    };

    var example = new Aws.Emr.Cluster("example", new Aws.Emr.ClusterArgs
    {
        Steps = new[] { debuggingStep },
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterStepArgs;
import com.pulumi.aws.emr.inputs.ClusterStepHadoopJarStepArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that registers an EMR cluster named "example" whose single
 * step runs command-runner.jar with the state-pusher-script argument.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the stack's resources.
     *
     * @param ctx the Pulumi context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        final var jarStep = ClusterStepHadoopJarStepArgs.builder()
            .jar("command-runner.jar")
            .args("state-pusher-script")
            .build();

        // The step terminates the cluster if it fails.
        final var debuggingStep = ClusterStepArgs.builder()
            .name("Setup Hadoop Debugging")
            .actionOnFailure("TERMINATE_CLUSTER")
            .hadoopJarStep(jarStep)
            .build();

        final var clusterArgs = ClusterArgs.builder()
            .steps(debuggingStep)
            .build();

        var example = new Cluster("example", clusterArgs);
    }
}
resources:
  # EMR cluster whose only configured step is the debugging setup step.
  example:
    type: aws:emr:Cluster
    properties:
      steps:
        # Runs command-runner.jar with the state-pusher-script argument;
        # the cluster is terminated if the step fails.
        - actionOnFailure: TERMINATE_CLUSTER
          name: Setup Hadoop Debugging
          hadoopJarStep:
            jar: command-runner.jar
            args:
              - state-pusher-script
Debug logging runs as the first step in your cluster. The hadoopJarStep invokes command-runner.jar with the state-pusher-script argument, which enables detailed logging. Set actionOnFailure to TERMINATE_CLUSTER to stop the cluster if logging setup fails. This step requires the logUri property (not shown) to specify an S3 destination.
Deploy a high-availability cluster with three masters
For production workloads requiring fault tolerance, EMR supports clusters with three master nodes for automatic failover.
import * as pulumi from "@pulumi/pulumi";
import * as aws from "@pulumi/aws";
// Illustrative only: this snippet shows just the settings that matter for a
// cluster with three master nodes.

// Public (Internet accessible) subnets must map a public IP on launch.
const example = new aws.ec2.Subnet("example", {
    mapPublicIpOnLaunch: true,
});

// Cluster with three master nodes and termination protection, placed in the
// subnet above. The core instance group is left empty for brevity.
const exampleCluster = new aws.emr.Cluster("example", {
    releaseLabel: "emr-5.24.1",
    masterInstanceGroup: {
        instanceCount: 3,
    },
    coreInstanceGroup: {},
    ec2Attributes: {
        subnetId: example.id,
    },
    terminationProtection: true,
});
import pulumi
import pulumi_aws as aws
# Illustrative only: this snippet shows just the settings that matter for a
# cluster with three master nodes.

# Public (Internet accessible) subnets must map a public IP on launch.
example = aws.ec2.Subnet("example", map_public_ip_on_launch=True)

# Cluster with three master nodes and termination protection, placed in the
# subnet above. The core instance group is left empty for brevity.
example_cluster = aws.emr.Cluster(
    "example",
    release_label="emr-5.24.1",
    master_instance_group={"instance_count": 3},
    core_instance_group={},
    ec2_attributes={"subnet_id": example.id},
    termination_protection=True,
)
package main
import (
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/ec2"
"github.com/pulumi/pulumi-aws/sdk/v7/go/aws/emr"
"github.com/pulumi/pulumi/sdk/v3/go/pulumi"
)
// main registers a subnet and an EMR cluster with three master nodes.
// The program is illustrative and shows only the settings relevant to that
// feature.
func main() {
	pulumi.Run(func(ctx *pulumi.Context) error {
		// Public (Internet accessible) subnets must map a public IP on launch.
		subnetArgs := &ec2.SubnetArgs{
			MapPublicIpOnLaunch: pulumi.Bool(true),
		}
		example, err := ec2.NewSubnet(ctx, "example", subnetArgs)
		if err != nil {
			return err
		}

		// Three master nodes with termination protection, placed in the
		// subnet above; the core instance group is left empty for brevity.
		clusterArgs := &emr.ClusterArgs{
			ReleaseLabel: pulumi.String("emr-5.24.1"),
			MasterInstanceGroup: &emr.ClusterMasterInstanceGroupArgs{
				InstanceCount: pulumi.Int(3),
			},
			CoreInstanceGroup: &emr.ClusterCoreInstanceGroupArgs{},
			Ec2Attributes: &emr.ClusterEc2AttributesArgs{
				SubnetId: example.ID(),
			},
			TerminationProtection: pulumi.Bool(true),
		}
		if _, err = emr.NewCluster(ctx, "example", clusterArgs); err != nil {
			return err
		}
		return nil
	})
}
using System.Collections.Generic;
using System.Linq;
using Pulumi;
using Aws = Pulumi.Aws;
// Registers a subnet and an EMR cluster with three master nodes.
// This configuration is for illustrative purposes and highlights
// only relevant configurations for working with this functionality.
return await Deployment.RunAsync(() =>
{
    // Map public IP on launch must be enabled for public (Internet accessible) subnets
    var example = new Aws.Ec2.Subnet("example", new()
    {
        MapPublicIpOnLaunch = true,
    });

    var exampleCluster = new Aws.Emr.Cluster("example", new()
    {
        ReleaseLabel = "emr-5.24.1",
        TerminationProtection = true,
        Ec2Attributes = new Aws.Emr.Inputs.ClusterEc2AttributesArgs
        {
            SubnetId = example.Id,
        },
        MasterInstanceGroup = new Aws.Emr.Inputs.ClusterMasterInstanceGroupArgs
        {
            // Three master nodes.
            InstanceCount = 3,
        },
        // Pass an empty core instance group block (settings omitted for
        // brevity) rather than null, which would drop the block from the
        // cluster definition altogether.
        CoreInstanceGroup = new Aws.Emr.Inputs.ClusterCoreInstanceGroupArgs(),
    });
});
package generated_program;
import com.pulumi.Context;
import com.pulumi.Pulumi;
import com.pulumi.core.Output;
import com.pulumi.aws.ec2.Subnet;
import com.pulumi.aws.ec2.SubnetArgs;
import com.pulumi.aws.emr.Cluster;
import com.pulumi.aws.emr.ClusterArgs;
import com.pulumi.aws.emr.inputs.ClusterEc2AttributesArgs;
import com.pulumi.aws.emr.inputs.ClusterMasterInstanceGroupArgs;
import com.pulumi.aws.emr.inputs.ClusterCoreInstanceGroupArgs;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.io.File;
import java.nio.file.Files;
import java.nio.file.Paths;
/**
 * Pulumi program that registers a subnet and an EMR cluster with three
 * master nodes.
 *
 * This configuration is for illustrative purposes and highlights only
 * relevant configurations for working with this functionality.
 */
public class App {
    public static void main(String[] args) {
        Pulumi.run(App::stack);
    }

    /**
     * Declares the stack's resources.
     *
     * @param ctx the Pulumi context supplied by {@code Pulumi.run}
     */
    public static void stack(Context ctx) {
        // Map public IP on launch must be enabled for public (Internet accessible) subnets
        var example = new Subnet("example", SubnetArgs.builder()
            .mapPublicIpOnLaunch(true)
            .build());

        // The cluster's resource name is "example"; only the local variable
        // is called exampleCluster.
        var exampleCluster = new Cluster("example", ClusterArgs.builder()
            .releaseLabel("emr-5.24.1")
            .terminationProtection(true)
            .ec2Attributes(ClusterEc2AttributesArgs.builder()
                .subnetId(example.id())
                .build())
            // Three master nodes.
            .masterInstanceGroup(ClusterMasterInstanceGroupArgs.builder()
                .instanceCount(3)
                .build())
            // Core instance group settings are omitted for brevity.
            .coreInstanceGroup(ClusterCoreInstanceGroupArgs.builder()
                .build())
            .build());
    }
}
resources:
  # Illustrative only: this snippet shows just the settings that matter for a
  # cluster with three master nodes.
  # Public (Internet accessible) subnets must map a public IP on launch.
  example:
    type: aws:ec2:Subnet
    properties:
      mapPublicIpOnLaunch: true
  # Cluster with three master nodes and termination protection, placed in the
  # subnet above. The core instance group is left empty for brevity.
  exampleCluster:
    type: aws:emr:Cluster
    name: example
    properties:
      releaseLabel: emr-5.24.1
      terminationProtection: true
      ec2Attributes:
        subnetId: ${example.id}
      masterInstanceGroup:
        instanceCount: 3
      coreInstanceGroup: {}
High-availability clusters require EMR 5.23.0 or later. Set instanceCount to 3 in the masterInstanceGroup to enable three-master mode. Enable terminationProtection to prevent accidental deletion. The subnet must have mapPublicIpOnLaunch enabled if the cluster needs Internet access for package downloads or external services.
Beyond these examples
These snippets focus on specific cluster-level features: instance groups and fleets with autoscaling, debug logging and step configuration, and high-availability master nodes. They’re intentionally minimal rather than full data processing pipelines.
The examples reference pre-existing infrastructure such as VPC subnets and security groups, IAM service roles and instance profiles, and S3 buckets for logs and bootstrap scripts. They focus on configuring the cluster rather than provisioning everything around it.
To keep things focused, common cluster patterns are omitted, including:
- Application-specific configurations (configurationsJson)
- Bootstrap actions for custom initialization
- Kerberos authentication (kerberosAttributes)
- Log encryption and security configurations
- Task node instance groups or fleets
- Custom AMIs and EBS root volume sizing
These omissions are intentional: the goal is to illustrate how each cluster feature is wired, not provide drop-in data processing solutions. See the EMR Cluster resource reference for all available configuration options.
Let's create AWS EMR Clusters
Get started with Pulumi Cloud, then follow our quick setup guide to deploy this infrastructure.
Try Pulumi Cloud for FREE
Frequently Asked Questions
Configuration & Immutability
The following properties cannot be changed after the cluster is created: name, releaseLabel, serviceRole, masterInstanceGroup/masterInstanceFleet, coreInstanceGroup/coreInstanceFleet, keepJobFlowAliveWhenNoSteps, ec2Attributes, bootstrapActions, applications, configurationsJson, logUri, securityConfiguration, customAmiId, and logEncryptionKmsKeyId. Only terminationProtection, stepConcurrencyLevel, tags, and unhealthyNodeReplacement can be modified.
Set terminationProtection to false and apply the configuration before attempting to destroy the cluster. Termination protection is enabled by default when using multiple master nodes.
IAM & Permissions
EMR clusters need a service role (in serviceRole) for the EMR service to access AWS resources, and an instance profile role (in ec2Attributes.instanceProfile) for EC2 instances. An additional Auto Scaling role (in autoscalingRole) is needed if using automatic scaling. The default instance profile EMR_EC2_DefaultRole with policy AmazonElasticMapReduceforEC2Role is deprecated, and you must create a custom instance profile to replace it.
Drift Detection & Lifecycle
The visibleToAllUsers argument is no longer supported by the AWS EMR API. Do not set this property, especially to false, as it will cause perpetual drift.
Use the ignoreChanges lifecycle option for kerberosAttributes if managing Kerberos settings outside Pulumi.
Use the ignoreChanges lifecycle option for the steps property to prevent Pulumi from detecting changes made to steps outside of your configuration.
AWS does not return additionalInfo values after cluster creation, so Pulumi cannot detect if this property is changed outside your configuration.
Instance Configuration
Set masterInstanceGroup.instanceCount to 3 and use releaseLabel 5.23.0 or later. Termination protection is automatically enabled with multiple masters. For public subnets, ensure mapPublicIpOnLaunch is enabled.
Use the aws.emr.InstanceGroup resource to configure task nodes. Task nodes cannot be configured directly in the cluster resource.
Logging & Debugging
Add a cluster step with hadoopJarStep configured with jar: "command-runner.jar" and args: ["state-pusher-script"]. Use ignoreChanges for the steps property if managing steps outside Pulumi.
Version Requirements & Compatibility
Custom AMIs (customAmiId) require 5.7.0+, security configurations (securityConfiguration) require 4.8.0+, multiple master nodes require 5.23.0+, concurrent steps (stepConcurrencyLevel) require 5.28.0+, and log encryption (logEncryptionKmsKeyId) requires 5.30.0+ (excluding 6.0.0).
Omit the Configurations field entirely if it’s empty rather than providing an empty array like "Configurations": []. Empty arrays cause configuration errors.
Using a different cloud?
Explore analytics guides for other cloud providers: