Skip to content

[BUG] Multimaster under TCP is not working #59053

Open
@s0undt3ch

Description

@s0undt3ch

Description
Multimaster is not working with the TCP transport.

Setup

Dockerfile
# NOTE(review): the base image is untagged, so it resolves to :latest; pin a
# tag or digest once a known-good one has been identified.
FROM saltstack/ci-centos-8

# Install tmux and drop the yum metadata/package cache in the same layer so it
# does not end up in the image.
RUN yum install -y tmux \
    && yum clean all
# Pin Salt to 3002.2, the release listed in this report's Versions Report, so
# the reproduction stays deterministic; skip pip's download cache.
RUN python3 -m pip install --no-cache-dir salt==3002.2
# Create the config directory of every daemon with a single mkdir invocation.
# All paths are arguments of one command; separate mkdir commands on continued
# lines would need "&&" between them, otherwise the word "mkdir" itself is
# taken as a directory name and created in the working directory.
RUN mkdir -p \
      /tmp/salt-tests-tmpdir/mm-master-1/conf \
      /tmp/salt-tests-tmpdir/mm-master-2/conf \
      /tmp/salt-tests-tmpdir/mm-minion-1/conf \
      /tmp/salt-tests-tmpdir/mm-minion-2/conf
# Write the master config for mm-master-1 (ret_port 41309, publish_port 55225,
# root_dir /tmp/salt-tests-tmpdir/mm-master-1). The transport starts as zeromq.
# Relies on the build shell's echo honoring -e (escape sequences) and -n.
RUN echo -en "api_logfile: logs/api.log\napi_pidfile: run/api.pid\ncachedir: cache\nenable_legacy_startup_events: false\nid: mm-master-1\ninterface: 127.0.0.1\nkey_logfile: logs/key.log\nlog_file: logs/master.log\nlog_fmt_console: '%(asctime)s,%(msecs)03.0f [%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_fmt_logfile: '[%(asctime)s,%(msecs)03.0f][%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_level_logfile: debug\nmax_open_files: 10240\nopen_mode: true\norder_masters: false\npidfile: run/master.pid\npillar_opts: false\npki_dir: pki\npublish_port: 55225\nret_port: 41309\nroot_dir: /tmp/salt-tests-tmpdir/mm-master-1\nsock_dir: run/master\ntcp_master_pub_port: 39653\ntcp_master_publish_pull: 33521\ntcp_master_pull_port: 48867\ntcp_master_workers: 52909\ntransport: zeromq\n" > /tmp/salt-tests-tmpdir/mm-master-1/conf/master

# Write the master config for mm-master-2 (ret_port 54151, publish_port 46579,
# root_dir /tmp/salt-tests-tmpdir/mm-master-2). The transport starts as zeromq.
RUN echo -en "api_logfile: logs/api.log\napi_pidfile: run/api.pid\ncachedir: cache\nenable_legacy_startup_events: false\nid: mm-master-2\ninterface: 127.0.0.1\nkey_logfile: logs/key.log\nlog_file: logs/master.log\nlog_fmt_console: '%(asctime)s,%(msecs)03.0f [%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_fmt_logfile: '[%(asctime)s,%(msecs)03.0f][%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_level_logfile: debug\nmax_open_files: 10240\nopen_mode: true\norder_masters: false\npidfile: run/master.pid\npillar_opts: false\npki_dir: pki\npublish_port: 46579\nret_port: 54151\nroot_dir: /tmp/salt-tests-tmpdir/mm-master-2\nsock_dir: run/master\ntcp_master_pub_port: 33155\ntcp_master_publish_pull: 44571\ntcp_master_pull_port: 35349\ntcp_master_workers: 56335\ntransport: zeromq\n" > /tmp/salt-tests-tmpdir/mm-master-2/conf/master

# Write the minion config for mm-minion-1. Its "master" list names both
# masters by their ret_port (127.0.0.1:41309 and 127.0.0.1:54151), and
# master_port is 41309. The transport starts as zeromq.
RUN echo -en "cachedir: cache\nid: mm-minion-1\ninterface: 127.0.0.1\nlog_file: logs/minion.log\nlog_fmt_console: '%(asctime)s,%(msecs)03.0f [%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_fmt_logfile: '[%(asctime)s,%(msecs)03.0f][%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_level_logfile: debug\nloop_interval: 0.05\nmaster:\n- 127.0.0.1:41309\n- 127.0.0.1:54151\nmaster_port: 41309\npidfile: run/minion.pid\npki_dir: pki\nroot_dir: /tmp/salt-tests-tmpdir/mm-minion-1\nsock_dir: run/minion\ntcp_pub_port: 41831\ntcp_pull_port: 56859\ntransport: zeromq\n" > /tmp/salt-tests-tmpdir/mm-minion-1/conf/minion

# Write the minion config for mm-minion-2. It lists the same two masters as
# mm-minion-1 but sets master_port to 54151. The transport starts as zeromq.
RUN echo -en "cachedir: cache\nid: mm-minion-2\ninterface: 127.0.0.1\nlog_file: logs/minion.log\nlog_fmt_console: '%(asctime)s,%(msecs)03.0f [%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_fmt_logfile: '[%(asctime)s,%(msecs)03.0f][%(name)-17s:%(lineno)-4d][%(levelname)-8s][%(processName)18s(%(process)d)]\n  %(message)s'\nlog_level_logfile: debug\nloop_interval: 0.05\nmaster:\n- 127.0.0.1:41309\n- 127.0.0.1:54151\nmaster_port: 54151\npidfile: run/minion.pid\npki_dir: pki\nroot_dir: /tmp/salt-tests-tmpdir/mm-minion-2\nsock_dir: run/minion\ntcp_pub_port: 48801\ntcp_pull_port: 42185\ntransport: zeromq\n" > /tmp/salt-tests-tmpdir/mm-minion-2/conf/minion

# Generate one small bash wrapper per daemon/CLI. Each wrapper runs the Salt
# command with -c pointing at the matching config directory and forwards all
# of its own arguments ("${@:1}").
RUN mkdir -p /tmp/salt-tests-tmpdir/scripts
# salt-master for mm-master-1.
RUN echo -e '#!/bin/bash\nsalt-master -c /tmp/salt-tests-tmpdir/mm-master-1/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-master-1.sh
# salt CLI talking to mm-master-1.
RUN echo -e '#!/bin/bash\nsalt -c /tmp/salt-tests-tmpdir/mm-master-1/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-salt-1.sh
# salt-master for mm-master-2. Before starting, it copies mm-master-1's
# pki/master.* files into mm-master-2's pki dir (presumably so both masters
# share one key pair - confirm). The cp only succeeds once those files exist
# under mm-master-1/pki.
RUN echo -e '#!/bin/bash\nmkdir -p /tmp/salt-tests-tmpdir/mm-master-2/pki\ncp /tmp/salt-tests-tmpdir/mm-master-1/pki/master.* /tmp/salt-tests-tmpdir/mm-master-2/pki/\nsalt-master -c /tmp/salt-tests-tmpdir/mm-master-2/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-master-2.sh
# salt CLI talking to mm-master-2.
RUN echo -e '#!/bin/bash\nsalt -c /tmp/salt-tests-tmpdir/mm-master-2/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-salt-2.sh
# salt-minion for mm-minion-1.
RUN echo -e '#!/bin/bash\nsalt-minion -c /tmp/salt-tests-tmpdir/mm-minion-1/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-minion-1.sh
# salt-minion for mm-minion-2.
RUN echo -e '#!/bin/bash\nsalt-minion -c /tmp/salt-tests-tmpdir/mm-minion-2/conf "${@:1}"' > /tmp/salt-tests-tmpdir/scripts/mm-minion-2.sh

# Generate start-tmux.sh. The script prepends the scripts directory to PATH
# and exports it, creates a detached tmux session "e" with the windows
# mm-master-1, mm-master-2, mm-minion-1, mm-minion-2 and shell-1 (each running
# "bash -i"), selects the shell-1 window and attaches to the session.
RUN echo -en "#!/bin/sh\nPATH=\"/tmp/salt-tests-tmpdir/scripts:\$PATH\"\nexport PATH\n\ntmux new-session -s e -d -n mm-master-1 'bash -i'\ntmux new-window -t e:1 -n mm-master-2 'bash -i'\ntmux new-window -t e:2 -n mm-minion-1 'bash -i'\ntmux new-window -t e:3 -n mm-minion-2 'bash -i'\ntmux new-window -t e:4 -n shell-1   'bash -i'\n\ntmux select-window -t e:4\ntmux -2 attach-session -t e" > /tmp/salt-tests-tmpdir/scripts/start-tmux.sh
# Generate switch-transport.sh. The script takes the new transport as its
# first argument and uses sed -i to rewrite the "transport: ..." line in the
# master and minion config files under /tmp/salt-tests-tmpdir/mm-*/conf/.
# The \$ and \\ escapes keep $1, $TRANSPORT and the sed group literal at build
# time, so they are only evaluated when the generated script runs.
RUN echo -en "#!/bin/bash\nTRANSPORT=\$1\necho Setting transport to: \$TRANSPORT\nsed -i \"s/^transport: \\(.*\\)$/transport: \$TRANSPORT/g\" /tmp/salt-tests-tmpdir/mm-*/conf/{master,minion}" > /tmp/salt-tests-tmpdir/scripts/switch-transport.sh

# Make every generated helper script executable.
RUN chmod +x /tmp/salt-tests-tmpdir/scripts/*.sh

# Exec (JSON-array) form: the script is started directly through its shebang
# instead of being wrapped in "/bin/sh -c".
ENTRYPOINT ["/tmp/salt-tests-tmpdir/scripts/start-tmux.sh"]

To simplify, you can use the s0undt3ch/multimaster-tcp docker container

Steps to Reproduce the behavior
Once you start the container, you'll have a tmux session open.

  • On window mm-master-1 run mm-master-1.sh
  • On window mm-master-2 run mm-master-2.sh
  • On window mm-minion-1 run mm-minion-1.sh
  • On window mm-minion-2 run mm-minion-2.sh

Now, on window shell-1, to verify it's working you can run mm-salt-1.sh \* test.ping

mm-minion-2:
    True
mm-minion-1:
    True

If you now stop mm-master-2, the previous command should still work:

mm-minion-2:
    True
mm-minion-1:
    True

Now, to expose the problem, stop all daemons, run switch-transport.sh tcp and then start all daemons again.
On window shell-1, to verify it's working, run mm-salt-1.sh \* test.ping, both minions should return True to the ping:

mm-minion-2:
    True
mm-minion-1:
    True

However, if you stop mm-master-2 and run mm-salt-1.sh \* test.ping, no minions will respond:

mm-minion-2:
    Minion did not return. [No response]
    The minions may not have all finished running and any remaining minions will return upon completion. To look up the return data for this job later, run the following command:
    
    salt-run jobs.lookup_jid 20201201064144142153
mm-minion-1:
    Minion did not return. [No response]
    The minions may not have all finished running and any remaining minions will return upon completion. To look up the return data for this job later, run the following command:
    
    salt-run jobs.lookup_jid 20201201064144142153
ERROR: Minions returned with non-zero exit code

Expected behavior
Both minions should still respond when one of the masters is down, just like what happens when the zeromq transport is set.

Screenshots
If applicable, add screenshots to help explain your problem.

Versions Report

salt --versions-report (Provided by running salt --versions-report)
Salt Version:
          Salt: 3002.2
 
Dependency Versions:
          cffi: Not Installed
      cherrypy: Not Installed
      dateutil: Not Installed
     docker-py: Not Installed
         gitdb: Not Installed
     gitpython: Not Installed
        Jinja2: 2.11.2
       libgit2: Not Installed
      M2Crypto: Not Installed
          Mako: Not Installed
       msgpack: 1.0.0
  msgpack-pure: Not Installed
  mysql-python: Not Installed
     pycparser: Not Installed
      pycrypto: Not Installed
  pycryptodome: 3.9.9
        pygit2: Not Installed
        Python: 3.6.8 (default, Nov 21 2019, 19:31:34)
  python-gnupg: Not Installed
        PyYAML: 5.3.1
         PyZMQ: 20.0.0
         smmap: Not Installed
       timelib: Not Installed
       Tornado: 4.5.3
           ZMQ: 4.3.3
 
System Versions:
          dist: centos 8 Core
        locale: UTF-8
       machine: x86_64
       release: 5.9.10-arch1-1
        system: Linux
       version: CentOS Linux 8 Core

Metadata

Metadata

Assignees

No one assigned

    Labels

    Bug: broken, incorrect, or confusing behavior · severity-high: 2nd top severity, seen by most users, causes major problems

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions