bpo-37531: regrtest ignores output on timeout (GH-16659) · python/cpython@598bfa4
@@ -22,6 +22,12 @@
2222PROGRESS_UPDATE = 30.0 # seconds
2323assert PROGRESS_UPDATE >= PROGRESS_MIN_TIME
242425+# Kill the main process after 5 minutes. It is supposed to write an update
26+# every PROGRESS_UPDATE seconds. Tolerate 5 minutes for Python slowest
27+# buildbot workers.
28+MAIN_PROCESS_TIMEOUT = 5 * 60.0
29+assert MAIN_PROCESS_TIMEOUT >= PROGRESS_UPDATE
30+2531# Time to wait until a worker completes: should be immediate
2632JOIN_TIMEOUT = 30.0 # seconds
2733@@ -159,20 +165,6 @@ def mp_result_error(self, test_name, error_type, stdout='', stderr='',
159165result = TestResult(test_name, error_type, test_time, None)
160166return MultiprocessResult(result, stdout, stderr, err_msg)
161167162-def _timedout(self, test_name):
163-self._kill()
164-165-stdout = stderr = ''
166-popen = self._popen
167-try:
168-stdout, stderr = popen.communicate(timeout=JOIN_TIMEOUT)
169-except (subprocess.TimeoutExpired, OSError) as exc:
170-print_warning(f"Failed to read {self} output "
171-f"(timeout={format_duration(JOIN_TIMEOUT)}): "
172-f"{exc!r}")
173-174-return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
175-176168def _run_process(self, test_name):
177169self.start_time = time.monotonic()
178170@@ -195,23 +187,32 @@ def _run_process(self, test_name):
195187196188try:
197189stdout, stderr = popen.communicate(timeout=self.timeout)
190+retcode = popen.returncode
191+assert retcode is not None
198192except subprocess.TimeoutExpired:
199193if self._stopped:
200194# kill() has been called: communicate() fails
201195# on reading closed stdout/stderr
202196raise ExitThread
203197204-return self._timedout(test_name)
198+# On timeout, kill the process
199+self._kill()
200+201+# None means TIMEOUT for the caller
202+retcode = None
203+# bpo-38207: Don't attempt to call communicate() again: on it
204+# can hang until all child processes using stdout and stderr
205+# pipes completes.
206+stdout = stderr = ''
205207except OSError:
206208if self._stopped:
207209# kill() has been called: communicate() fails
208210# on reading closed stdout/stderr
209211raise ExitThread
210212raise
211-212-retcode = popen.returncode
213-stdout = stdout.strip()
214-stderr = stderr.rstrip()
213+else:
214+stdout = stdout.strip()
215+stderr = stderr.rstrip()
215216216217return (retcode, stdout, stderr)
217218except:
@@ -223,13 +224,10 @@ def _run_process(self, test_name):
223224self.current_test_name = None
224225225226def _runtest(self, test_name):
226-result = self._run_process(test_name)
227-228-if isinstance(result, MultiprocessResult):
229-# _timedout() case
230-return result
227+retcode, stdout, stderr = self._run_process(test_name)
231228232-retcode, stdout, stderr = result
229+if retcode is None:
230+return self.mp_result_error(test_name, TIMEOUT, stdout, stderr)
233231234232err_msg = None
235233if retcode != 0:
@@ -248,7 +246,8 @@ def _runtest(self, test_name):
248246err_msg = "Failed to parse worker JSON: %s" % exc
249247250248if err_msg is not None:
251-return self.mp_result_error(test_name, CHILD_ERROR, stdout, stderr, err_msg)
249+return self.mp_result_error(test_name, CHILD_ERROR,
250+stdout, stderr, err_msg)
252251253252return MultiprocessResult(result, stdout, stderr, err_msg)
254253@@ -287,6 +286,16 @@ def _wait_completed(self):
287286f"{exc!r}")
288287289288def wait_stopped(self, start_time):
289+# bpo-38207: MultiprocessTestRunner.stop_workers() called self.stop()
290+# which killed the process. Sometimes, killing the process from the
291+# main thread does not interrupt popen.communicate() in
292+# TestWorkerProcess thread. This loop with a timeout is a workaround
293+# for that.
294+#
295+# Moreover, if this method fails to join the thread, it is likely
296+# that Python will hang at exit while calling threading._shutdown()
297+# which tries again to join the blocked thread. Regrtest.main()
298+# uses EXIT_TIMEOUT to workaround this second bug.
290299while True:
291300# Write a message every second
292301self.join(1.0)
@@ -353,7 +362,8 @@ def _get_result(self):
353362timeout = PROGRESS_UPDATE
354363while True:
355364if use_faulthandler:
356-faulthandler.dump_traceback_later(timeout * 2.0, exit=True)
365+faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
366+exit=True)
357367358368# wait for a thread
359369try: