bpo-37531: regrtest ignores output on timeout (GH-16659) · python/cpython@359a197
@@ -22,6 +22,12 @@
2222PROGRESS_UPDATE = 30.0 # seconds
2323assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
242425+# Kill the main process after 5 minutes. It is supposed to write an update
26+# every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27+# buildbot workers.
28+MAIN_PROCESS_TIMEOUT = 5 * 60.0
29+assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30+2531# Time to wait until a worker completes: should be immediate
2632JOIN_TIMEOUT = 30.0 # seconds
2733@@ -160,20 +166,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
160166result = TestResult(test_name, error_type, test_time, None)
161167return MultiprocessResult(result, stdout, stderr, err_msg)
162168163-def _timedout(self, test_name):
164-self._kill()
165-166-stdout = stderr = ''
167-popen = self._popen
168-try:
169-stdout, stderr = popen.communicate(timeout=JOIN_TIMEOUT)
170-except (subprocess.TimeoutExpired, OSError) as exc:
171-print_warning(f"Failed to read {self} output "
172-f"(timeout={format_duration(JOIN_TIMEOUT)}): "
173-f"{exc!r}")
174-175-return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
176-177169def _run_process(self, test_name):
178170self.start_time = time.monotonic()
179171@@ -197,23 +189,32 @@ def _run_process(self, test_name):
197189198190try:
199191stdout, stderr = popen.communicate(timeout=self.timeout)
192+retcode = popen.returncode
193+assert retcode is not None
200194except subprocess.TimeoutExpired:
201195if self._stopped:
202196# kill() has been called: communicate() fails
203197# on reading closed stdout/stderr
204198raise ExitThread
205199206-return self._timedout(test_name)
200+# On timeout, kill the process
201+self._kill()
202+203+# None means TIMEOUT for the caller
204+retcode = None
205+# bpo-38207: Don't attempt to call communicate() again: on it
206+# can hang until all child processes using stdout and stderr
207+# pipes completes.
208+stdout = stderr = ''
207209except OSError:
208210if self._stopped:
209211# kill() has been called: communicate() fails
210212# on reading closed stdout/stderr
211213raise ExitThread
212214raise
213-214-retcode = popen.returncode
215-stdout = stdout.strip()
216-stderr = stderr.rstrip()
215+else:
216+stdout = stdout.strip()
217+stderr = stderr.rstrip()
217218218219return (retcode, stdout, stderr)
219220except:
@@ -225,13 +226,10 @@ def _run_process(self, test_name):
225226self.current_test_name = None
226227227228def _runtest(self, test_name):
228-result = self._run_process(test_name)
229-230-if isinstance(result, MultiprocessResult):
231-# _timedout() case
232-return result
229+retcode, stdout, stderr = self._run_process(test_name)
233230234-retcode, stdout, stderr = result
231+if retcode is None:
232+return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
235233236234err_msg = None
237235if retcode != 0:
@@ -250,7 +248,8 @@ def _runtest(self, test_name):
250248err_msg = "Failed to parse worker JSON: %s" % exc
251249252250if err_msg is not None:
253-return self.mp_result_error(test_name, CHILD_ERROR, stdout, stderr, err_msg)
251+return self.mp_result_error(test_name, CHILD_ERROR,
252+stdout, stderr, err_msg)
254253255254return MultiprocessResult(result, stdout, stderr, err_msg)
256255@@ -289,6 +288,16 @@ def _wait_completed(self):
289288f"{exc!r}")
290289291290def wait_stopped(self, start_time):
291+# bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
292+# which killed the process. Sometimes, killing the process from the
293+# main thread does not interrupt popen.communicate() in
294+# TestWorkerProcess thread. This loop with a timeout is a workaround
295+# for that.
296+#
297+# Moreover, if this method fails to join the thread, it is likely
298+# that Python will hang at exit while calling threading._shutdown()
299+# which tries again to join the blocked thread. Regrtest.main()
300+# uses EXIT_TIMEOUT to workaround this second bug.
292301while True:
293302# Write a message every second
294303self.join(1.0)
@@ -355,7 +364,8 @@ def _get_result(self):
355364timeout = PROGRESS_UPDATE
356365while True:
357366if use_faulthandler:
358-faulthandler.dump_traceback_later(timeout * 2.0, exit=True)
367+faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
368+exit=True)
359369360370# wait for a thread
361371try: