bpo-40521: Disable Unicode caches in isolated subinterpreters (GH-19933) · python/cpython@607b102
@@ -198,6 +198,11 @@ extern "C" {
198198# define OVERALLOCATE_FACTOR 4
199199#endif
200200201+/* bpo-40521: Interned strings are shared by all interpreters. */
202+#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
203+# define INTERNED_STRINGS
204+#endif
205+201206/* This dictionary holds all interned unicode strings. Note that references
202207 to strings in this dictionary are *not* counted in the string's ob_refcnt.
203208 When the interned string reaches a refcnt of 0 the string deallocation
@@ -206,7 +211,9 @@ extern "C" {
206211 Another way to look at this is to say that the actual reference
207212 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
208213*/
214+#ifdef INTERNED_STRINGS
209215static PyObject *interned = NULL;
216+#endif
210217211218/* The empty Unicode object is shared to improve performance. */
212219static PyObject *unicode_empty = NULL;
@@ -281,9 +288,16 @@ unicode_decode_utf8(const char *s, Py_ssize_t size,
281288/* List of static strings. */
282289static _Py_Identifier *static_strings = NULL;
283290291+/* bpo-40521: Latin1 singletons are shared by all interpreters. */
292+#ifndef EXPERIMENTAL_ISOLATED_SUBINTERPRETERS
293+# define LATIN1_SINGLETONS
294+#endif
295+296+#ifdef LATIN1_SINGLETONS
284297/* Single character Unicode strings in the Latin-1 range are being
285298 shared as well. */
286299static PyObject *unicode_latin1[256] = {NULL};
300+#endif
287301288302/* Fast detection of the most frequent whitespace characters */
289303const unsigned char _Py_ascii_whitespace[] = {
@@ -662,6 +676,7 @@ unicode_result_ready(PyObject *unicode)
662676return unicode_empty;
663677 }
664678679+#ifdef LATIN1_SINGLETONS
665680if (length == 1) {
666681const void *data = PyUnicode_DATA(unicode);
667682int kind = PyUnicode_KIND(unicode);
@@ -683,6 +698,7 @@ unicode_result_ready(PyObject *unicode)
683698 }
684699 }
685700 }
701+#endif
686702687703assert(_PyUnicode_CheckConsistency(unicode, 1));
688704return unicode;
@@ -1913,10 +1929,12 @@ unicode_dealloc(PyObject *unicode)
19131929case SSTATE_INTERNED_MORTAL:
19141930/* revive dead object temporarily for DelItem */
19151931Py_SET_REFCNT(unicode, 3);
1932+#ifdef INTERNED_STRINGS
19161933if (PyDict_DelItem(interned, unicode) != 0) {
19171934_PyErr_WriteUnraisableMsg("deletion of interned string failed",
19181935NULL);
19191936 }
1937+#endif
19201938break;
1921193919221940case SSTATE_INTERNED_IMMORTAL:
@@ -1944,15 +1962,18 @@ unicode_dealloc(PyObject *unicode)
19441962static int
19451963unicode_is_singleton(PyObject *unicode)
19461964{
1947-PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1948-if (unicode == unicode_empty)
1965+if (unicode == unicode_empty) {
19491966return 1;
1967+ }
1968+#ifdef LATIN1_SINGLETONS
1969+PyASCIIObject *ascii = (PyASCIIObject *)unicode;
19501970if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
19511971 {
19521972Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
19531973if (ch < 256 && unicode_latin1[ch] == unicode)
19541974return 1;
19551975 }
1976+#endif
19561977return 0;
19571978}
19581979#endif
@@ -2094,16 +2115,28 @@ unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
20942115static PyObject*
20952116get_latin1_char(unsigned char ch)
20962117{
2097-PyObject *unicode = unicode_latin1[ch];
2118+PyObject *unicode;
2119+2120+#ifdef LATIN1_SINGLETONS
2121+unicode = unicode_latin1[ch];
2122+if (unicode) {
2123+Py_INCREF(unicode);
2124+return unicode;
2125+ }
2126+#endif
2127+2128+unicode = PyUnicode_New(1, ch);
20982129if (!unicode) {
2099-unicode = PyUnicode_New(1, ch);
2100-if (!unicode)
2101-return NULL;
2102-PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2103-assert(_PyUnicode_CheckConsistency(unicode, 1));
2104-unicode_latin1[ch] = unicode;
2130+return NULL;
21052131 }
2132+2133+PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2134+assert(_PyUnicode_CheckConsistency(unicode, 1));
2135+2136+#ifdef LATIN1_SINGLETONS
21062137Py_INCREF(unicode);
2138+unicode_latin1[ch] = unicode;
2139+#endif
21072140return unicode;
21082141}
21092142@@ -11270,7 +11303,6 @@ int
1127011303_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
1127111304{
1127211305PyObject *right_uni;
11273-Py_hash_t hash;
11274113061127511307assert(_PyUnicode_CHECK(left));
1127611308assert(right->string);
@@ -11302,10 +11334,12 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
1130211334if (PyUnicode_CHECK_INTERNED(left))
1130311335return 0;
113041133611337+#ifdef INTERNED_STRINGS
1130511338assert(_PyUnicode_HASH(right_uni) != -1);
11306-hash = _PyUnicode_HASH(left);
11339+Py_hash_t hash = _PyUnicode_HASH(left);
1130711340if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
1130811341return 0;
11342+#endif
11309113431131011344return unicode_compare_eq(left, right_uni);
1131111345}
@@ -15487,43 +15521,55 @@ void
1548715521PyUnicode_InternInPlace(PyObject **p)
1548815522{
1548915523PyObject *s = *p;
15490-PyObject *t;
1549115524#ifdef Py_DEBUG
1549215525assert(s != NULL);
1549315526assert(_PyUnicode_CHECK(s));
1549415527#else
15495-if (s == NULL || !PyUnicode_Check(s))
15528+if (s == NULL || !PyUnicode_Check(s)) {
1549615529return;
15530+ }
1549715531#endif
15532+1549815533/* If it's a subclass, we don't really know what putting
1549915534 it in the interned dict might do. */
15500-if (!PyUnicode_CheckExact(s))
15535+if (!PyUnicode_CheckExact(s)) {
1550115536return;
15502-if (PyUnicode_CHECK_INTERNED(s))
15537+ }
15538+15539+if (PyUnicode_CHECK_INTERNED(s)) {
1550315540return;
15541+ }
15542+15543+#ifdef INTERNED_STRINGS
1550415544if (interned == NULL) {
1550515545interned = PyDict_New();
1550615546if (interned == NULL) {
1550715547PyErr_Clear(); /* Don't leave an exception */
1550815548return;
1550915549 }
1551015550 }
15551+15552+PyObject *t;
1551115553Py_ALLOW_RECURSION
1551215554t = PyDict_SetDefault(interned, s, s);
1551315555Py_END_ALLOW_RECURSION
15556+1551415557if (t == NULL) {
1551515558PyErr_Clear();
1551615559return;
1551715560 }
15561+1551815562if (t != s) {
1551915563Py_INCREF(t);
1552015564Py_SETREF(*p, t);
1552115565return;
1552215566 }
15567+1552315568/* The two references in interned are not counted by refcnt.
1552415569 The deallocator will take care of this */
1552515570Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
1552615571_PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15572+#endif
1552715573}
15528155741552915575void
@@ -16109,9 +16155,11 @@ _PyUnicode_Fini(PyThreadState *tstate)
16109161551611016156Py_CLEAR(unicode_empty);
161111615716158+#ifdef LATIN1_SINGLETONS
1611216159for (Py_ssize_t i = 0; i < 256; i++) {
1611316160Py_CLEAR(unicode_latin1[i]);
1611416161 }
16162+#endif
1611516163_PyUnicode_ClearStaticStrings();
1611616164 }
1611716165