From 5640a15df50a935ef2dc080351a2f1e0106c8bf6 Mon Sep 17 00:00:00 2001 From: Sophist <3001893+Sophist-UK@users.noreply.github.com> Date: Sat, 26 Mar 2022 21:19:38 +0000 Subject: [PATCH 1/2] Updated ccbench.py used to assess multithreading As used in https://github.com/faster-cpython/ideas/discussions/328. Guido asked me to submit my changes as a PR so that the enhancements would be available to others. --- Tools/ccbench/ccbench.py | 506 +++++++++++++++++++++++++++++---------- 1 file changed, 382 insertions(+), 124 deletions(-) diff --git a/Tools/ccbench/ccbench.py b/Tools/ccbench/ccbench.py index d52701a82948da..3c4274cdd62804 100644 --- a/Tools/ccbench/ccbench.py +++ b/Tools/ccbench/ccbench.py @@ -11,10 +11,12 @@ import os import sys import itertools +import multiprocessing import threading import subprocess import socket from optparse import OptionParser, SUPPRESS_HELP +from statistics import fmean, pstdev import platform # Compatibility @@ -28,16 +30,69 @@ except AttributeError: pass +# psutil + try: + import psutil + except ImportError: + psutil = None + + + +PYTHON_VERSION = tuple(map(int,platform.python_version_tuple())) +# Add the first two parts of the version number as a tuple for any version of Python +# where the tests hang (because the same thread monopolises the GIL) +YIELD_VERSIONS = [ + # e.g. (2, 7) +] + +CORES = multiprocessing.cpu_count() THROUGHPUT_DURATION = 2.0 -LATENCY_PING_INTERVAL = 0.1 LATENCY_DURATION = 2.0 +LATENCY_PING_INTERVAL = 0.1 +LATENCY_PORT = 16000 -BANDWIDTH_PACKET_SIZE = 1024 BANDWIDTH_DURATION = 2.0 +BANDWIDTH_ECHO_INTERVAL = 0.0 +BANDWIDTH_PORT = 16001 +BANDWIDTH_PACKET_SIZE = 1024 + +for mod in 'bz2', 'hashlib': + try: + globals()[mod] = __import__(mod) + except ImportError: + globals()[mod] = None +def cpu_times(): + if psutil is None: + times = os.times() + return times.user, times.system, None + times = psutil.Process().cpu_times() + iowait = times.iowait if len(times) > 4 else None + return times.user, times.system, iowait + +def cpu_delta(start, finish): + user = finish[0] - start[0] + system = finish[1] - start[1] + total = user + system + iowait = finish[2] - start[2] if start[2] is not None and finish[2] is not None else None + return total, user, system, iowait + +def calc_cpu_usage(start, finish, baseline=None): + times = [] + delta = cpu_delta(start, finish) + if baseline is None: + baseline = (None,) * 4 + for type, cpu, base in zip(("Total", "User", "System", "IoWait"), delta, baseline): + if cpu is not None: + if base: + times.append("%s: %7.4f (%+6.1f%%)" % (type, cpu, (cpu / base - 1.0) * 100)) + else: + times.append("%s: %7.4f" % (type, cpu)) + return "CPU: %s" % ", ".join(times) + def task_pidigits(): """Pi calculation (Python)""" _map = map @@ -94,56 +149,56 @@ def list_sort(l): return list_sort, (list(range(1000)), ) -def task_compress_zlib(): - """zlib compression (C)""" - import zlib - with open(__file__, "rb") as f: - arg = f.read(5000) * 3 - - def compress(s): - zlib.decompress(zlib.compress(s, 5)) - return compress, (arg, ) - -def task_compress_bz2(): - """bz2 compression (C)""" - import bz2 - with open(__file__, "rb") as f: - arg = f.read(3000) * 2 +throughput_tasks = [task_pidigits, task_regex] - def compress(s): - bz2.compress(s) - return compress, (arg, ) +# For whatever reasons, zlib gives irregular results, +# so we only run zlib if bz2 and hashlib are not available. 
+# (NOTE: hashlib releases the GIL from 2.7 and 3.1 onwards) +if bz2 is not None: + def task_compress_bz2(): + """bz2 compression (C)""" + with open(__file__, "rb") as f: + arg = f.read(3000) * 2 + + def compress(s): + bz2.compress(s) + return compress, (arg, ) + throughput_tasks.append(task_compress_bz2) -def task_hashing(): - """SHA1 hashing (C)""" - import hashlib +if hashlib is not None: with open(__file__, "rb") as f: arg = f.read(5000) * 30 - def compute(s): - hashlib.sha1(s).digest() - return compute, (arg, ) - - -throughput_tasks = [task_pidigits, task_regex] -for mod in 'bz2', 'hashlib': - try: - globals()[mod] = __import__(mod) - except ImportError: - globals()[mod] = None + def task_hashing_sha1(): + """SHA1 hashing (C)""" + def compute(s): + hashlib.sha1(s).digest() + return compute, (arg, ) + throughput_tasks.append(task_hashing_sha1) + + def task_hashing_sha512(): + """SHA512 hashing (C) - GIL is released""" + def compute(s): + hashlib.sha512(s).digest() + return compute, (arg, ) + throughput_tasks.append(task_hashing_sha512) + +if bz2 is None and hashlib is None: + def task_compress_zlib(): + """zlib compression (C)""" + import zlib + with open(__file__, "rb") as f: + arg = f.read(5000) * 3 + + def compress(s): + zlib.decompress(zlib.compress(s, 5)) + return compress, (arg, ) -# For whatever reasons, zlib gives irregular results, so we prefer bz2 or -# hashlib if available. -# (NOTE: hashlib releases the GIL from 2.7 and 3.1 onwards) -if bz2 is not None: - throughput_tasks.append(task_compress_bz2) -elif hashlib is not None: - throughput_tasks.append(task_hashing) -else: throughput_tasks.append(task_compress_zlib) latency_tasks = throughput_tasks bandwidth_tasks = [task_pidigits] +bandwidth_tasks = throughput_tasks class TimedLoop: @@ -159,7 +214,7 @@ def __call__(self, start_time, min_duration, end_event, do_yield=False): _sleep = time.sleep _func = self.func _args = self.args - t1 = start_time + t1 = _time() while True: for i in range(step): _func(*_args) @@ -167,16 +222,16 @@ def __call__(self, start_time, min_duration, end_event, do_yield=False): # If another thread terminated, the current measurement is invalid # => return the previous one. if end_event: - return niters, duration + return float(niters)/float(duration) if duration > 0 else 0.0 niters += step duration = t2 - start_time if duration >= min_duration: end_event.append(None) - return niters, duration + return float(niters)/float(duration) if duration > 0 else 0.0 if t2 - t1 < 0.01: # Minimize interference of measurement on overall runtime step = step * 3 // 2 - elif do_yield: + elif do_yield and PYTHON_VERSION[:2] in YIELD_VERSIONS: # OS scheduling of Python threads is sometimes so bad that we # have to force thread switching ourselves, otherwise we get # completely useless results. @@ -198,8 +253,8 @@ def run_throughput_test(func, args, nthreads): # Pure single-threaded performance, without any switching or # synchronization overhead. 
start_time = time.time() - results.append(loop(start_time, THROUGHPUT_DURATION, - end_event, do_yield=False)) + + results.append(loop(start_time, THROUGHPUT_DURATION, end_event)) return results started = False @@ -243,18 +298,26 @@ def run_throughput_tests(max_threads): print() func, args = task() nthreads = 1 - baseline_speed = None + baseline_speed = baseline_cpu = None while nthreads <= max_threads: + start_cpu = cpu_times() results = run_throughput_test(func, args, nthreads) + finish_cpu = cpu_times() # Taking the max duration rather than average gives pessimistic # results rather than optimistic. - speed = sum(r[0] for r in results) / max(r[1] for r in results) - print("threads=%d: %d" % (nthreads, speed), end="") + speed = round(sum(results)) + + print("threads=%2d: %4d" % (nthreads, speed), end="") if baseline_speed is None: - print(" iterations/s.") + print(" iterations/sec - Baseline: ", end="") baseline_speed = speed else: - print(" ( %d %%)" % (speed / baseline_speed * 100)) + ratio = (speed / baseline_speed - 1.0) * 100 + stdev = pstdev(results) + print(" (%+6.1f%%, std dev: %3d its/sec) - " % (ratio, stdev), end="") + print(calc_cpu_usage(start_cpu, finish_cpu, baseline_cpu)) + if baseline_cpu is None: + baseline_cpu = cpu_delta(start_cpu, finish_cpu) nthreads += 1 print() @@ -267,7 +330,15 @@ def _sendto(sock, s, addr): def _recv(sock, n): return sock.recv(n).decode('ascii') -def latency_client(addr, nb_pings, interval): +def latency_client( + addr="127.0.0.1", + duration=LATENCY_DURATION, + interval=LATENCY_PING_INTERVAL, + port = LATENCY_PORT, +): + nb_pings = int(duration / interval) + if isinstance(addr, str): + addr = (addr, port) sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) try: _time = time.time @@ -285,25 +356,26 @@ def _ping(): finally: sock.close() -def run_latency_client(**kwargs): +def run_latency_client(*args): cmd_line = [sys.executable, '-E', os.path.abspath(__file__)] - cmd_line.extend(['--latclient', repr(kwargs)]) + cmd_line.extend(['--latclient', str(args)]) + print(" ".join(cmd_line)) return subprocess.Popen(cmd_line) #, stdin=subprocess.PIPE, #stdout=subprocess.PIPE, stderr=subprocess.STDOUT) -def run_latency_test(func, args, nthreads): +def run_latency_test(func, args, nthreads, interval): # Create a listening socket to receive the pings. We use UDP which should # be painlessly cross-platform. sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) sock.bind(("127.0.0.1", 0)) addr = sock.getsockname() - interval = LATENCY_PING_INTERVAL duration = LATENCY_DURATION nb_pings = int(duration / interval) results = [] threads = [] + thread_results = [] end_event = [] start_cond = threading.Condition() started = False @@ -323,7 +395,7 @@ def run(): with start_cond: while not started: start_cond.wait() - loop(start_time, duration * 1.5, end_event, do_yield=False) + thread_results.append(loop(start_time, duration * 1.5, end_event)) for i in range(nthreads): threads.append(threading.Thread(target=run)) @@ -338,8 +410,7 @@ def run(): # Run the client and wait for the first ping(s) to arrive before # unblocking the background threads. 
    chunks = []
-    process = run_latency_client(addr=sock.getsockname(),
-                                 nb_pings=nb_pings, interval=interval)
+    process = run_latency_client(sock.getsockname(), duration, interval)
     s = _recv(sock, 4096)
     _time = time.time

@@ -370,32 +441,43 @@ def run():
             assert isinstance(send_time, float)
             results.append((send_time, recv_time))

-    return results
+    return results, thread_results

-def run_latency_tests(max_threads):
+def run_latency_tests(max_threads, interval):
     for task in latency_tasks:
         print("Background CPU task:", task.__doc__)
         print()
         func, args = task()
         nthreads = 0
+        baseline_cpu = None
         while nthreads <= max_threads:
-            results = run_latency_test(func, args, nthreads)
-            n = len(results)
+            start_cpu = cpu_times()
+            results, thread_results = run_latency_test(func, args, nthreads, interval)
+            throughput = sum(thread_results)
+            finish_cpu = cpu_times()
             # We print out milliseconds
             lats = [1000 * (t2 - t1) for (t1, t2) in results]
             #print(list(map(int, lats)))
-            avg = sum(lats) / n
-            dev = (sum((x - avg) ** 2 for x in lats) / n) ** 0.5
-            print("CPU threads=%d: %d ms. (std dev: %d ms.)" % (nthreads, avg, dev), end="")
+            avg = fmean(lats)
+            stdev = pstdev(lats)
+            print("CPU threads=%2d: %3d ms (std dev: %3d ms)" % (nthreads, avg, stdev), end="")
+            if nthreads > 0:
+                print(" - Throughput: %4d its/sec - %s"
+                      % (throughput, calc_cpu_usage(start_cpu, finish_cpu, baseline_cpu)),
+                      end=""
+                )
+                if baseline_cpu is None:
+                    baseline_cpu = cpu_delta(start_cpu, finish_cpu)
             print()
-            #print("  [... from %d samples]" % n)
             nthreads += 1
         print()

 BW_END = "END"

-def bandwidth_client(addr, packet_size, duration):
+def bandwidth_client(addr, packet_size, duration, interval, port=BANDWIDTH_PORT):
+    if isinstance(addr, str):
+        addr = (addr, port)
     sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
     sock.bind(("127.0.0.1", 0))
     local_addr = sock.getsockname()

@@ -413,18 +495,20 @@ def _send_chunk(msg):
             _send_chunk(str(i))
             s = _recv(sock, packet_size)
             assert len(s) == packet_size
+            _sleep(interval)
             i += 1
         _send_chunk(BW_END)
     finally:
         sock.close()

-def run_bandwidth_client(**kwargs):
+def run_bandwidth_client(*args):
     cmd_line = [sys.executable, '-E', os.path.abspath(__file__)]
-    cmd_line.extend(['--bwclient', repr(kwargs)])
+    cmd_line.extend(['--bwclient', str(args)])
+    # print(" ".join(cmd_line))
     return subprocess.Popen(cmd_line) #, stdin=subprocess.PIPE,
                             #stdout=subprocess.PIPE, stderr=subprocess.STDOUT)

-def run_bandwidth_test(func, args, nthreads):
+def run_bandwidth_test(func, args, nthreads, interval):
     # Create a listening socket to receive the packets. We use UDP which should
     # be painlessly cross-platform.
     with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
@@ -436,6 +520,7 @@ def run_bandwidth_test(func, args, nthreads):
         results = []
         threads = []
+        thread_results = []
         end_event = []
         start_cond = threading.Condition()
         started = False
@@ -443,7 +528,6 @@ def run_bandwidth_test(func, args, nthreads):
         # Warm up
         func(*args)

-        results = []
         loop = TimedLoop(func, args)
         ready = []
         ready_cond = threading.Condition()
@@ -455,7 +539,7 @@ def run():
             with start_cond:
                 while not started:
                     start_cond.wait()
-            loop(start_time, duration * 1.5, end_event, do_yield=False)
+            thread_results.append(loop(start_time, duration * 1.5, end_event))

         for i in range(nthreads):
             threads.append(threading.Thread(target=run))
@@ -469,9 +553,7 @@ def run():
         # Run the client and wait for the first packet to arrive before
         # unblocking the background threads.
-        process = run_bandwidth_client(addr=addr,
-                                       packet_size=packet_size,
-                                       duration=duration)
+        process = run_bandwidth_client(addr, packet_size, duration, interval)
         _time = time.time
         # This will also wait for the parent to be ready
         s = _recv(sock, packet_size)

@@ -482,13 +564,15 @@ def run():
             started = True
             start_cond.notify(nthreads)

-        n = 0
+        n = -1
         first_time = None
-        while not end_event and BW_END not in s:
+        while not end_event:
             _sendto(sock, s, remote_addr)
             s = _recv(sock, packet_size)
             if first_time is None:
                 first_time = _time()
+            if BW_END in s:
+                break
             n += 1
         end_time = _time()

@@ -497,52 +581,220 @@ def run():
             t.join()
         process.kill()

-        return (n - 1) / (end_time - first_time)
+        speed = n / (end_time - first_time) if end_time != first_time else 0.0
+
+        return speed, sum(thread_results)

-def run_bandwidth_tests(max_threads):
+def run_bandwidth_tests(max_threads, interval):
     for task in bandwidth_tasks:
         print("Background CPU task:", task.__doc__)
         print()
         func, args = task()
         nthreads = 0
-        baseline_speed = None
+        baseline_speed = baseline_cpu = None
         while nthreads <= max_threads:
-            results = run_bandwidth_test(func, args, nthreads)
-            speed = results
+            start_cpu = cpu_times()
+            speed, throughput = run_bandwidth_test(func, args, nthreads, interval)
+            finish_cpu = cpu_times()
             #speed = len(results) * 1.0 / results[-1][0]
-            print("CPU threads=%d: %.1f" % (nthreads, speed), end="")
-            if baseline_speed is None:
-                print(" packets/s.")
+            print("CPU threads=%2d: %6.1f" % (nthreads, speed), end="")
+            if nthreads == 0:
+                print(" packets/sec")
                 baseline_speed = speed
             else:
-                print(" ( %d %%)" % (speed / baseline_speed * 100))
+                delta = (speed / baseline_speed - 1) * 100
+                print(" pkt/sec (%+6.1f%%) - Throughput: %4d its/sec - %s"
+                      % (delta, throughput, calc_cpu_usage(start_cpu, finish_cpu, baseline_cpu))
+                )
+            if baseline_cpu is None and nthreads > 0:
+                baseline_cpu = cpu_delta(start_cpu, finish_cpu)
             nthreads += 1
         print()

+def print_env(*args, **kwargs):
+    print("== %s %s (%s) %d-bit ==" % (
+        platform.python_implementation(),
+        platform.python_version(),
+        platform.python_build()[0],
+        (len(hex(sys.maxsize)) - 2) * 4,
+    ))
+    # Processor identification often has repeated spaces
+    cpu = ' '.join(platform.processor().split())
+    print("== %s %s %s on '%s' with %d cores ==" % (
+        platform.machine(),
+        platform.system(),
+        platform.architecture()[0],
+        cpu,
+        CORES,
+    ))
+    name, lock, version = sys.thread_info
+    print("== Threads: %s, Lock: %s, Version: %s ==" % (name, lock, version))
+    check_interval = None if PYTHON_VERSION >= (3, 9) else sys.getcheckinterval()
+    switch_interval = None if PYTHON_VERSION < (3, 2) else sys.getswitchinterval()
+    print("== Check interval: %s, Switch interval: %s ==" % (check_interval, switch_interval))
+
+    if psutil:
+        analyse_psutil(psutil, *args, **kwargs)
+    else:
+        print("== Unable to document priority and affinity - run 'pip install psutil' ==")
+
+    print()
+
+def analyse_psutil(psutil, new_affinity=None):
+    system = platform.system()
+    process = psutil.Process()
+    priority = process.nice()
+    io_priority = process.ionice()
+    affinity = process.cpu_affinity()
+    if system == "Windows":
+        priorities = {
+            psutil.IDLE_PRIORITY_CLASS: "Idle",
+            psutil.BELOW_NORMAL_PRIORITY_CLASS: "Below normal",
+            psutil.NORMAL_PRIORITY_CLASS: "Normal",
+            psutil.ABOVE_NORMAL_PRIORITY_CLASS: "Above normal",
+            psutil.HIGH_PRIORITY_CLASS: "High",
+            psutil.REALTIME_PRIORITY_CLASS: "Real-time",
+        }
+        io_priorities = {
+            psutil.IOPRIO_VERYLOW: "Very low",
+            psutil.IOPRIO_LOW: "Low",
+            psutil.IOPRIO_NORMAL: "Normal",
+            psutil.IOPRIO_HIGH: "High",
+        }
+        priority = priorities[priority] if priority in priorities else repr(priority)
+        io_priority = io_priorities[io_priority] if io_priority in io_priorities else repr(io_priority)
+    else:
+        io_priorities = {
+            psutil.IOPRIO_CLASS_NONE: "None",
+            psutil.IOPRIO_CLASS_IDLE: "Idle",
+            psutil.IOPRIO_CLASS_BE: "Best efforts",
+            psutil.IOPRIO_CLASS_RT: "Real-time",
+        }
+        priority = str(priority)
+        io_value = io_priority[1]
+        io_priority = io_priorities[io_priority[0]] if io_priority[0] in io_priorities else repr(io_priority)
+        if isinstance(io_value, int):
+            io_priority += ":%d" % io_value
+
+    print("== Cores: %d, Hyperthreads: %d, Priority: %s, I/O priority: %s, Affinity: %s ==" % (
+        psutil.cpu_count(False),
+        psutil.cpu_count(),
+        priority,
+        io_priority,
+        str(affinity),
+    ))
+
+    set_env(priority, io_priority, new_affinity)
+
+def set_env(priority="", io_priority="", new_affinity=None):
+    if not psutil:
+        return
+    explicit_affinity = new_affinity is not None
+    system = platform.system()
+    process = psutil.Process()
+    affinity = process.cpu_affinity()
+    if new_affinity is None:
+        new_affinity = list(range(CORES))
+    elif isinstance(new_affinity, int):
+        n = min(new_affinity, CORES)
+        # Just in case this is a hyperthreading processor (AFAIK max 2 hyperthreads per real core)
+        # we will use alternate threads as much as we can.
+        # This should make no real difference if the processor does not have hyperthreading
+        # NOTE: We are assuming that all cores are of equivalent power - if this is not the case
+        # user will have to set affinity explicitly using options as AFAIK there is no way
+        # to determine the details of individual cores in Python.
+        real_cores = int(CORES / 2)
+        if n == real_cores:
+            new_affinity = list(range(CORES))
+        elif n <= real_cores:
+            new_affinity = list(range(0, n * 2, 2))
+        else:
+            new_affinity = list(range(2 * n - CORES))
+            new_affinity.extend(list(range(2 * n - CORES, CORES, 2)))
+
+    # Attempt to adjust priorities to highest possible for benchmark to minimise impact of everything else
+    if system == "Windows":
+        if priority not in ("Real-time", "High"):
+            process.nice(psutil.HIGH_PRIORITY_CLASS)
+            if priority:
+                print("!! Process priority set to HIGH !!")
+        if io_priority != "High":
+            try:
+                process.ionice(psutil.IOPRIO_HIGH)
+                if io_priority:
+                    print("!! Process I/O priority set to HIGH !!")
+            except psutil.AccessDenied:
+                pass
+    else:
+        if priority != "-20":
+            process.nice(-20)
+            if priority:
+                print("!! Process nice set to -20 !!")
+        if not io_priority.startswith("Real-time"):
+            try:
+                process.ionice(psutil.IOPRIO_CLASS_RT, 0)
+                if io_priority:
+                    print("!! Process I/O priority set to REALTIME:0 !!")
+            except psutil.AccessDenied:
+                pass
+    if affinity != new_affinity:
+        process.cpu_affinity(new_affinity)
+        if explicit_affinity:
+            print("!! Process affinity set to %s !!"
% str(new_affinity)) + def main(): usage = "usage: %prog [-h|--help] [options]" parser = OptionParser(usage=usage) parser.add_option("-t", "--throughput", - action="store_true", dest="throughput", default=False, - help="run throughput tests") + action="store_true", dest="throughput", default=False, + help="run throughput tests" + ) parser.add_option("-l", "--latency", - action="store_true", dest="latency", default=False, - help="run latency tests") + action="store_true", dest="latency", default=False, + help="run latency tests" + ) parser.add_option("-b", "--bandwidth", - action="store_true", dest="bandwidth", default=False, - help="run I/O bandwidth tests") + action="store_true", dest="bandwidth", default=False, + help="run I/O bandwidth tests" + ) parser.add_option("-i", "--interval", - action="store", type="int", dest="check_interval", default=None, - help="sys.setcheckinterval() value " - "(Python 3.8 and older)") + action="store", type="int", dest="check_interval", default=None, + help="sys.setcheckinterval() value " + "(Python 3.8 and older)" + ) parser.add_option("-I", "--switch-interval", - action="store", type="float", dest="switch_interval", default=None, - help="sys.setswitchinterval() value " - "(Python 3.2 and newer)") + action="store", type="float", dest="switch_interval", default=None, + help="sys.setswitchinterval() value " + "(Python 3.2 and newer - default 0.005s)" + ) + parser.add_option("-a", "--affinity", + action="store", type="str", dest="affinity", default=None, + help="process affinity - number of cores or a python list of cores that " + "this process should run on (default all processor cores " + "i.e. on this computer %d cores)." % CORES + ) parser.add_option("-n", "--num-threads", - action="store", type="int", dest="nthreads", default=4, - help="max number of threads in tests") + action="store", type="int", dest="nthreads", default=None, + help="max number of threads in tests (default 2*affinity " + "i.e. %d threads on this computer)." % (CORES * 2) + ) + parser.add_option("-p", "--ping-interval", + action="store", type="float", dest="ping_interval", default=LATENCY_PING_INTERVAL, + help="the delay in seconds between receiving a latency/ping response " + "and sending the next ping request (default %default sec). " + "A delay of (say) between 0.01s and 0.1s is likely representative of either/both " + "real I/O and GUI event loop activities such as moving the mouse." + ) + parser.add_option("-e", "--echo-interval", + action="store", type="float", dest="echo_interval", default=BANDWIDTH_ECHO_INTERVAL, + help="the delay in seconds between receiving a bandwidth/echo response " + "and sending the next echo request (default %default sec). " + "If the echo-interval is >0.0, there is very little difference between " + "the ping and echo tests. " + "On the other hand, a 0.0 delay does not mimic any known real-life workload." 
+ ) # Hidden option to run the pinging and bandwidth clients parser.add_option("", "--latclient", @@ -557,50 +809,56 @@ def main(): parser.error("unexpected arguments") if options.latclient: - kwargs = eval(options.latclient) - latency_client(**kwargs) + set_env() + args = tuple(eval(options.latclient)) + latency_client(*args) return if options.bwclient: - kwargs = eval(options.bwclient) - bandwidth_client(**kwargs) + set_env() + args = tuple(eval(options.bwclient)) + bandwidth_client(*args) return - if not options.throughput and not options.latency and not options.bandwidth: - options.throughput = options.latency = options.bandwidth = True + if options.affinity: + options.affinity = eval(options.affinity) + + if options.nthreads is None: + if options.affinity is None: + options.nthreads = 2 * CORES + elif isinstance(options.affinity, int): + options.nthreads = 2 * options.affinity + elif isinstance(options.affinity, list): + options.nthreads = 2 * len(options.affinity) + else: + options.nthreads = 2 * CORES + if options.check_interval: + print("!! Setting check interval to %d !!" % options.check_interval) sys.setcheckinterval(options.check_interval) if options.switch_interval: + print("!! Setting switch interval to %.3fs !!" % options.switch_interval) sys.setswitchinterval(options.switch_interval) - print("== %s %s (%s) ==" % ( - platform.python_implementation(), - platform.python_version(), - platform.python_build()[0], - )) - # Processor identification often has repeated spaces - cpu = ' '.join(platform.processor().split()) - print("== %s %s on '%s' ==" % ( - platform.machine(), - platform.system(), - cpu, - )) - print() + print_env(new_affinity=options.affinity) + if not options.throughput and not options.latency and not options.bandwidth: + options.throughput = options.latency = options.bandwidth = True if options.throughput: print("--- Throughput ---") print() run_throughput_tests(options.nthreads) + print() if options.latency: print("--- Latency ---") print() - run_latency_tests(options.nthreads) + run_latency_tests(options.nthreads, options.ping_interval) if options.bandwidth: print("--- I/O bandwidth ---") print() - run_bandwidth_tests(options.nthreads) + run_bandwidth_tests(options.nthreads, options.echo_interval) if __name__ == "__main__": main() From 0a987914c4a8fbf9f7b46194dbaa07744729e704 Mon Sep 17 00:00:00 2001 From: Sophist <3001893+Sophist-UK@users.noreply.github.com> Date: Sat, 26 Mar 2022 21:34:16 +0000 Subject: [PATCH 2/2] Tidy code a little --- Tools/ccbench/ccbench.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/Tools/ccbench/ccbench.py b/Tools/ccbench/ccbench.py index 3c4274cdd62804..1551c01ac91c16 100644 --- a/Tools/ccbench/ccbench.py +++ b/Tools/ccbench/ccbench.py @@ -31,10 +31,10 @@ pass # psutil - try: - import psutil - except ImportError: - psutil = None +try: + import psutil +except ImportError: + psutil = None @@ -42,7 +42,7 @@ # Add the first two parts of the version number as a tuple for any version of Python # where the tests hang (because the same thread monopolises the GIL) YIELD_VERSIONS = [ - # e.g. (2, 7) + # e.g. (2, 7), ] CORES = multiprocessing.cpu_count() @@ -350,7 +350,8 @@ def _ping(): # We give the parent a bit of time to notice. 
         _sleep(1.0)
         for i in range(nb_pings):
-            _sleep(interval)
+            if interval:
+                _sleep(interval)
             _ping()
         _sendto(sock, LAT_END + "\n", addr)
     finally:
@@ -495,7 +496,8 @@ def _send_chunk(msg):
             _send_chunk(str(i))
             s = _recv(sock, packet_size)
             assert len(s) == packet_size
-            _sleep(interval)
+            if interval:
+                _sleep(interval)
             i += 1
         _send_chunk(BW_END)
     finally:
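Both patches reuse a single measurement pattern: sample CPU time before and after a run with cpu_times(), turn the pair into a delta with cpu_delta(), and report later runs against the first run's delta via calc_cpu_usage(). The snippet below is a minimal standalone sketch of that pattern, not part of the patch: it assumes the patched Tools/ccbench/ccbench.py is importable as "ccbench" (e.g. run from that directory), and busy() is an arbitrary stand-in workload.

```python
# Sketch only: exercises the CPU-accounting helpers added in PATCH 1/2.
# Assumes the patched Tools/ccbench/ccbench.py is on sys.path as "ccbench";
# psutil stays optional, exactly as in the patch (os.times() fallback).
from ccbench import cpu_times, cpu_delta, calc_cpu_usage

def busy(n=2_000_000):
    # Arbitrary CPU-bound stand-in for a benchmark task.
    total = 0
    for i in range(n):
        total += i * i
    return total

baseline = None
for run in range(3):
    start = cpu_times()     # (user, system, iowait-or-None)
    busy()
    finish = cpu_times()
    # The first run has no baseline; later runs show +/- percentages against it.
    print("run %d - %s" % (run, calc_cpu_usage(start, finish, baseline)))
    if baseline is None:
        baseline = cpu_delta(start, finish)   # (total, user, system, iowait)
```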