ec2_asg: fix #28087 and #35993 (#36679)

Fixes #35993 - Changes to update_size in commit eb4cc31 made it so the group dict passed into update_size was no longer modified. As a result, the 'replace' call does not see an updated min_size as it previously did and doesn't pause to wait for any new instances to spin up. Instead, it moves straight on to terminating old instances. The fix is to add batch_size to min_size when calling wait_for_new_inst. Fixes #28087 - Make replace_all_instances and replace_instances behave exactly the same by setting replace_instances = current list of instances when replace_all_instances is used. The root cause of the issue was that, without lc_check, terminate_batch terminates all instances passed to it, and after updating the asg size we were querying the asg again for the list of instances - so terminate_batch saw a list that included the new ones just spun up. When creating a new asg with replace_all_instances: yes and lc_check: false, the instances that are initially created are then subsequently replaced. This change makes it so that replace only occurs if the asg already existed. Add integration tests for #28087 and #35993.
2025-07-22 12:50:22 -07:00 · 2018-03-05 10:47:31 -06:00 · 2018-03-05 10:47:31 -06:00 · a2b3120e85
commit a2b3120e85
parent ce416f247f
2 changed files with 132 additions and 2 deletions
--- a/test/integration/targets/ec2_asg/tasks/main.yml
+++ b/test/integration/targets/ec2_asg/tasks/main.yml
@@ -387,6 +387,122 @@

    # ============================================================

+    # perform rolling replace with new launch configuration and lc_check:false
+
+    # Note - this is done async so we can query asg_facts during
+    # the execution. Issues #28087 and #35993 produce the correct
+    # end result, but spin up extraneous instances during execution.
+    # Start the rolling replace in the background (poll: 0) and keep the job
+    # handle in asg_job so that its completion can be checked afterwards.
+    - name: "perform rolling update to new AMI with lc_check: false"
+      ec2_asg:
+        name: "{{ resource_prefix }}-asg"
+        launch_config_name: "{{ resource_prefix }}-lc-2"
+        health_check_type: EC2
+        desired_capacity: 3
+        min_size: 1
+        max_size: 5
+        health_check_period: 900
+        load_balancers: []
+        vpc_zone_identifier: "{{ testing_subnet.subnet.id }}"
+        wait_for_instances: true
+        replace_all_instances: true
+        replace_batch_size: 3
+        lc_check: false
+        wait_timeout: 1800
+        state: present
+        <<: *aws_connection_info
+      async: 1800
+      poll: 0
+      register: asg_job
+
+    # Sample the group 12 times with a 15 second pause between samples; every
+    # sample is kept in output.
+    - name: get ec2_asg facts for 3 minutes
+      ec2_asg_facts:
+        name: "{{ resource_prefix }}-asg"
+        <<: *aws_connection_info
+      register: output
+      with_sequence: count=12
+      loop_control:
+        pause: 15
+
+    # JMESPath expression used to pull every instance_id out of the
+    # registered ec2_asg_facts samples.
+    - set_fact:
+        inst_id_json_query: "results[*].results[*].instances[*].instance_id"
+
+    # Since we started with 3 servers and replaced all of them,
+    # we should see 6 servers total.
+    # Count the distinct instance ids seen across all samples.
+    - assert:
+        that:
+          - >-
+            lookup('flattened', output | json_query(inst_id_json_query)).split(',')
+            | unique | length == 6
+
+    # Poll the background job up to 200 times, 15 seconds apart, until it
+    # reports finished.
+    - name: Ensure ec2_asg task completes
+      async_status:
+        jid: "{{ asg_job.ansible_job_id }}"
+      register: status
+      until: status.finished
+      retries: 200
+      delay: 15
+
+    # ============================================================
+
+    # Remove the group so that the next scenario creates it from scratch.
+    - name: kill asg
+      ec2_asg:
+        <<: *aws_connection_info
+        state: absent
+        name: "{{ resource_prefix }}-asg"
+      async: 300
+
+    # Create new asg with replace_all_instances and lc_check:false
+
+    # Note - this is done async so we can query asg_facts during
+    # the execution. Issue #28087 produces the correct
+    # end result, but spins up extraneous instances during execution.
+    # Create the group in the background (poll: 0) and keep the job handle in
+    # asg_job so that its completion can be checked afterwards.
+    - name: "new asg with lc_check: false"
+      ec2_asg:
+        name: "{{ resource_prefix }}-asg"
+        launch_config_name: "{{ resource_prefix }}-lc"
+        health_check_type: EC2
+        desired_capacity: 3
+        min_size: 1
+        max_size: 5
+        health_check_period: 900
+        load_balancers: []
+        vpc_zone_identifier: "{{ testing_subnet.subnet.id }}"
+        wait_for_instances: true
+        replace_all_instances: true
+        replace_batch_size: 3
+        lc_check: false
+        wait_timeout: 1800
+        state: present
+        <<: *aws_connection_info
+      async: 1800
+      poll: 0
+      register: asg_job
+
+    # Collect ec2_asg_facts for 3 minutes
+    # Sample the group 12 times with a 15 second pause between samples; every
+    # sample is kept in output.
+    - name: get ec2_asg facts
+      ec2_asg_facts:
+        name: "{{ resource_prefix }}-asg"
+        <<: *aws_connection_info
+      register: output
+      with_sequence: count=12
+      loop_control:
+        pause: 15
+
+    # JMESPath expression used to pull every instance_id out of the
+    # registered ec2_asg_facts samples.
+    - set_fact:
+        inst_id_json_query: "results[*].results[*].instances[*].instance_id"
+
+    # Get all instance_ids we saw and assert we saw number expected
+    # Should only see 3 (don't replace instances we just created)
+    # Count the distinct instance ids seen across all samples.
+    - assert:
+        that:
+          - >-
+            lookup('flattened', output | json_query(inst_id_json_query)).split(',')
+            | unique | length == 3
+
+    # Poll the background job up to 200 times, 15 seconds apart, until it
+    # reports finished.
+    - name: Ensure ec2_asg task completes
+      async_status:
+        jid: "{{ asg_job.ansible_job_id }}"
+      register: status
+      until: status.finished
+      retries: 200
+      delay: 15
+
+    # ============================================================
+
  always:

    - name: kill asg