From d8a2b6628790c0df874a45b58a488418125749f8 Mon Sep 17 00:00:00 2001
From: Niels Dossche <7771979+nielsdos@users.noreply.github.com>
Date: Sun, 26 May 2024 13:51:40 +0200
Subject: [PATCH] Fix bug #79701: getElementById does not correctly work with
duplicate definitions
This is a long standing bug: IDs aren't properly tracked causing either
outdated or plain incorrect results from getElementById.
This PR implements a pragmatic solution in which we still try to use the
ID lookup table to a degree, but only as a performance boost not as a
"single source of truth". Full details are explained in the
getElementById code.
---
ext/dom/attr.c | 13 ++
ext/dom/document.c | 51 ++++---
ext/dom/element.c | 62 +++++---
ext/dom/internal_helpers.h | 27 ++++
ext/dom/node.c | 1 +
ext/dom/php_dom.h | 1 +
ext/dom/tests/bug79701/id_property.phpt | 53 +++++++
ext/dom/tests/bug79701/node.phpt | 18 +++
ext/dom/tests/bug79701/prepend.phpt | 29 ++++
ext/dom/tests/bug79701/remove_attribute.phpt | 21 +++
ext/dom/tests/bug79701/set_attr_value.phpt | 45 ++++++
.../tests/bug79701/set_attribute_html.phpt | 82 ++++++++++
.../tests/bug79701/set_attribute_ns_html.phpt | 85 +++++++++++
ext/dom/tests/bug79701/set_attribute_xml.phpt | 143 ++++++++++++++++++
ext/dom/tests/bug79701/swap.phpt | 47 ++++++
ext/dom/tests/bug79701/toggle.phpt | 17 +++
ext/dom/tests/bug79701/unconnected.phpt | 16 ++
17 files changed, 670 insertions(+), 41 deletions(-)
create mode 100644 ext/dom/tests/bug79701/id_property.phpt
create mode 100644 ext/dom/tests/bug79701/node.phpt
create mode 100644 ext/dom/tests/bug79701/prepend.phpt
create mode 100644 ext/dom/tests/bug79701/remove_attribute.phpt
create mode 100644 ext/dom/tests/bug79701/set_attr_value.phpt
create mode 100644 ext/dom/tests/bug79701/set_attribute_html.phpt
create mode 100644 ext/dom/tests/bug79701/set_attribute_ns_html.phpt
create mode 100644 ext/dom/tests/bug79701/set_attribute_xml.phpt
create mode 100644 ext/dom/tests/bug79701/swap.phpt
create mode 100644 ext/dom/tests/bug79701/toggle.phpt
create mode 100644 ext/dom/tests/bug79701/unconnected.phpt
diff --git a/ext/dom/attr.c b/ext/dom/attr.c
index 9cefed6b74798..99686acb61fd0 100644
--- a/ext/dom/attr.c
+++ b/ext/dom/attr.c
@@ -25,6 +25,7 @@
#include "php_dom.h"
#include "dom_properties.h"
+#include "internal_helpers.h"
/*
* class DOMAttr extends DOMNode
@@ -106,6 +107,16 @@ zend_result dom_attr_specified_read(dom_object *obj, zval *retval)
/* }}} */
+void dom_attr_value_will_change(dom_object *obj, xmlAttrPtr attrp)
+{
+ if (attrp->atype == XML_ATTRIBUTE_ID) {
+ xmlRemoveID(attrp->doc, attrp);
+ attrp->atype = XML_ATTRIBUTE_ID;
+ }
+
+ dom_mark_ids_modified(obj->document);
+}
+
/* {{{ value string
readonly=no
URL: http://www.w3.org/TR/2003/WD-DOM-Level-3-Core-20030226/DOM3-Core.html#ID-221662474
@@ -122,6 +133,8 @@ zend_result dom_attr_value_write(dom_object *obj, zval *newval)
{
DOM_PROP_NODE(xmlAttrPtr, attrp, obj);
+ dom_attr_value_will_change(obj, attrp);
+
/* Typed property, this is already a string */
ZEND_ASSERT(Z_TYPE_P(newval) == IS_STRING);
zend_string *str = Z_STR_P(newval);
diff --git a/ext/dom/document.c b/ext/dom/document.c
index 54a9e99a304d3..79149bef69b03 100644
--- a/ext/dom/document.c
+++ b/ext/dom/document.c
@@ -1028,34 +1028,47 @@ Since: DOM Level 2
*/
PHP_METHOD(DOMDocument, getElementById)
{
- zval *id;
xmlDocPtr docp;
- xmlAttrPtr attrp;
size_t idname_len;
dom_object *intern;
char *idname;
- id = ZEND_THIS;
- if (zend_parse_parameters(ZEND_NUM_ARGS(), "s", &idname, &idname_len) == FAILURE) {
- RETURN_THROWS();
- }
-
- DOM_GET_OBJ(docp, id, xmlDocPtr, intern);
+ ZEND_PARSE_PARAMETERS_START(1, 1)
+ Z_PARAM_STRING(idname, idname_len)
+ ZEND_PARSE_PARAMETERS_END();
- attrp = xmlGetID(docp, BAD_CAST idname);
+ DOM_GET_OBJ(docp, ZEND_THIS, xmlDocPtr, intern);
- /* From the moment an ID is created, libxml2's behaviour is to cache that element, even
- * if that element is not yet attached to the document. Similarly, only upon destruction of
- * the element the ID is actually removed by libxml2. Since libxml2 has such behaviour deeply
- * ingrained in the library, and uses the cache for various purposes, it seems like a bad
- * idea and lost cause to fight it. Instead, we'll simply walk the tree upwards to check
- * if the node is attached to the document. */
- if (attrp && attrp->parent && php_dom_is_node_connected(attrp->parent)) {
- DOM_RET_OBJ((xmlNodePtr) attrp->parent, intern);
+ /* If the document has not been manipulated yet, the ID cache will be in sync and we can trust its result.
+ * This check mainly exists because a lot of times people just query a document without modifying it,
+ * and we can allow quick access to IDs in that case. */
+ if (!dom_is_document_cache_modified_since_parsing(intern->document)) {
+ const xmlAttr *attrp = xmlGetID(docp, BAD_CAST idname);
+ if (attrp && attrp->parent) {
+ DOM_RET_OBJ(attrp->parent, intern);
+ }
} else {
- RETVAL_NULL();
- }
+ /* From the moment an ID is created, libxml2's behaviour is to cache that element, even
+ * if that element is not yet attached to the document. Similarly, only upon destruction of
+ * the element the ID is actually removed by libxml2. Since libxml2 has such behaviour deeply
+ * ingrained in the library, and uses the cache for various purposes, it seems like a bad
+ * idea and lost cause to fight it. */
+
+ const xmlNode *base = (const xmlNode *) docp;
+ const xmlNode *node = base->children;
+ while (node != NULL) {
+ if (node->type == XML_ELEMENT_NODE) {
+ for (const xmlAttr *attr = node->properties; attr != NULL; attr = attr->next) {
+ if (attr->atype == XML_ATTRIBUTE_ID && dom_compare_value(attr, BAD_CAST idname)) {
+ DOM_RET_OBJ((xmlNodePtr) node, intern);
+ return;
+ }
+ }
+ }
+ node = php_dom_next_in_tree_order(node, base);
+ }
+ }
}
/* }}} end dom_document_get_element_by_id */
diff --git a/ext/dom/element.c b/ext/dom/element.c
index 73a7d142b7566..c20bcb013088a 100644
--- a/ext/dom/element.c
+++ b/ext/dom/element.c
@@ -183,7 +183,7 @@ zend_result dom_element_id_read(dom_object *obj, zval *retval)
return dom_element_reflected_attribute_read(obj, retval, "id");
}
-static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id);
+static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id, php_libxml_ref_obj *document);
zend_result dom_element_id_write(dom_object *obj, zval *newval)
{
@@ -191,7 +191,7 @@ zend_result dom_element_id_write(dom_object *obj, zval *newval)
if (!attr) {
return FAILURE;
}
- php_set_attribute_id(attr, true);
+ php_set_attribute_id(attr, true, obj->document);
return SUCCESS;
}
/* }}} */
@@ -352,6 +352,16 @@ static xmlNodePtr dom_create_attribute(xmlNodePtr nodep, const char *name, const
}
}
+static void dom_check_register_attribute_id(xmlAttrPtr attr, php_libxml_ref_obj *document)
+{
+ dom_mark_ids_modified(document);
+
+ if (attr->atype != XML_ATTRIBUTE_ID && attr->doc->type == XML_HTML_DOCUMENT_NODE && attr->ns == NULL && xmlStrEqual(attr->name, BAD_CAST "id")) {
+ /* To respect XML's ID behaviour, we only do this registration for HTML documents. */
+ attr->atype = XML_ATTRIBUTE_ID;
+ }
+}
+
/* {{{ URL: http://www.w3.org/TR/2003/WD-DOM-Level-3-Core-20030226/DOM3-Core.html#core-ID-F68F082
Modern spec URL: https://dom.spec.whatwg.org/#dom-element-setattribute
Since:
@@ -360,7 +370,6 @@ PHP_METHOD(DOMElement, setAttribute)
{
zval *id;
xmlNode *nodep;
- xmlNodePtr attr = NULL;
int name_valid;
size_t name_len, value_len;
dom_object *intern;
@@ -394,23 +403,28 @@ PHP_METHOD(DOMElement, setAttribute)
}
/* Can't use xmlSetNsProp unconditionally here because that doesn't take into account the qualified name matching... */
- attr = (xmlNodePtr) php_dom_get_attribute_node(nodep, BAD_CAST name, name_len);
+ xmlAttrPtr attr = php_dom_get_attribute_node(nodep, BAD_CAST name, name_len);
if (attr != NULL) {
- dom_remove_all_children(attr);
+ dom_attr_value_will_change(intern, attr);
+ dom_remove_all_children((xmlNodePtr) attr);
xmlNodePtr node = xmlNewDocText(attr->doc, BAD_CAST value);
- xmlAddChild(attr, node);
+ xmlAddChild((xmlNodePtr) attr, node);
} else {
- attr = (xmlNodePtr) xmlSetNsProp(nodep, NULL, name_processed, BAD_CAST value);
+ attr = xmlSetNsProp(nodep, NULL, name_processed, BAD_CAST value);
+ if (EXPECTED(attr != NULL)) {
+ dom_check_register_attribute_id(attr, intern->document);
+ }
}
if (name_processed != BAD_CAST name) {
efree(name_processed);
}
} else {
- attr = dom_get_attribute_or_nsdecl(intern, nodep, BAD_CAST name, name_len);
+ xmlNodePtr attr = dom_get_attribute_or_nsdecl(intern, nodep, BAD_CAST name, name_len);
if (attr != NULL) {
switch (attr->type) {
case XML_ATTRIBUTE_NODE:
+ dom_attr_value_will_change(intern, (xmlAttrPtr) attr);
node_list_unlink(attr->children);
break;
case XML_NAMESPACE_DECL:
@@ -693,7 +707,10 @@ static void dom_element_set_attribute_node_common(INTERNAL_FUNCTION_PARAMETERS,
xmlAddChild(nodep, (xmlNodePtr) attrp);
if (!modern) {
+ dom_mark_ids_modified(intern->document);
php_dom_reconcile_attribute_namespace_after_insertion(attrp);
+ } else {
+ dom_check_register_attribute_id(attrp, intern->document);
}
/* Returns old property if removed otherwise NULL */
@@ -870,6 +887,8 @@ static void dom_set_attribute_ns_legacy(dom_object *intern, xmlNodePtr elemp, ch
int errorcode = dom_check_qname(name, &localname, &prefix, uri_len, name_len);
if (errorcode == 0) {
+ dom_mark_ids_modified(intern->document);
+
if (uri_len > 0) {
nodep = (xmlNodePtr) xmlHasNsProp(elemp, BAD_CAST localname, BAD_CAST uri);
if (nodep != NULL && nodep->type != XML_ATTRIBUTE_DECL) {
@@ -958,8 +977,11 @@ static void dom_set_attribute_ns_modern(dom_object *intern, xmlNodePtr elemp, ze
if (errorcode == 0) {
php_dom_libxml_ns_mapper *ns_mapper = php_dom_get_ns_mapper(intern);
xmlNsPtr ns = php_dom_libxml_ns_mapper_get_ns_raw_prefix_string(ns_mapper, prefix, xmlStrlen(prefix), uri);
- if (UNEXPECTED(xmlSetNsProp(elemp, ns, localname, BAD_CAST value) == NULL)) {
+ xmlAttrPtr attr = xmlSetNsProp(elemp, ns, localname, BAD_CAST value);
+ if (UNEXPECTED(attr == NULL)) {
php_dom_throw_error(INVALID_STATE_ERR, /* strict */ true);
+ } else {
+ dom_check_register_attribute_id(attr, intern->document);
}
} else {
php_dom_throw_error(errorcode, /* strict */ true);
@@ -1270,20 +1292,16 @@ PHP_METHOD(DOMElement, hasAttributeNS)
}
/* }}} end dom_element_has_attribute_ns */
-static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id) /* {{{ */
+static void php_set_attribute_id(xmlAttrPtr attrp, bool is_id, php_libxml_ref_obj *document) /* {{{ */
{
- if (is_id == 1 && attrp->atype != XML_ATTRIBUTE_ID) {
- xmlChar *id_val;
-
- id_val = xmlNodeListGetString(attrp->doc, attrp->children, 1);
- if (id_val != NULL) {
- xmlAddID(NULL, attrp->doc, id_val, attrp);
- xmlFree(id_val);
- }
- } else if (is_id == 0 && attrp->atype == XML_ATTRIBUTE_ID) {
+ if (is_id && attrp->atype != XML_ATTRIBUTE_ID) {
+ attrp->atype = XML_ATTRIBUTE_ID;
+ } else if (!is_id && attrp->atype == XML_ATTRIBUTE_ID) {
xmlRemoveID(attrp->doc, attrp);
attrp->atype = 0;
}
+
+ dom_mark_ids_modified(document);
}
/* }}} */
@@ -1311,7 +1329,7 @@ PHP_METHOD(DOMElement, setIdAttribute)
if (attrp == NULL || attrp->type == XML_ATTRIBUTE_DECL) {
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
} else {
- php_set_attribute_id(attrp, is_id);
+ php_set_attribute_id(attrp, is_id, intern->document);
}
}
/* }}} end dom_element_set_id_attribute */
@@ -1340,7 +1358,7 @@ PHP_METHOD(DOMElement, setIdAttributeNS)
if (attrp == NULL || attrp->type == XML_ATTRIBUTE_DECL) {
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
} else {
- php_set_attribute_id(attrp, is_id);
+ php_set_attribute_id(attrp, is_id, intern->document);
}
}
/* }}} end dom_element_set_id_attribute_ns */
@@ -1367,7 +1385,7 @@ static void dom_element_set_id_attribute_node(INTERNAL_FUNCTION_PARAMETERS, zend
if (attrp->parent != nodep) {
php_dom_throw_error(NOT_FOUND_ERR, dom_get_strict_error(intern->document));
} else {
- php_set_attribute_id(attrp, is_id);
+ php_set_attribute_id(attrp, is_id, intern->document);
}
}
diff --git a/ext/dom/internal_helpers.h b/ext/dom/internal_helpers.h
index 0f67550898ab4..a19367dd47770 100644
--- a/ext/dom/internal_helpers.h
+++ b/ext/dom/internal_helpers.h
@@ -63,4 +63,31 @@ DOM_DEF_GET_CE_FUNC(domimplementation)
#endif
+static zend_always_inline size_t dom_minimum_modification_nr_since_parsing(php_libxml_ref_obj *doc_ptr)
+{
+ /* For old-style DOM, we need a "new DOMDocument" + a load, so the minimum modification nr is 2.
+ * For new-style DOM, we need only to call a named constructor, so the minimum modification nr is 1. */
+ return doc_ptr->class_type == PHP_LIBXML_CLASS_MODERN ? 1 : 2;
+}
+
+static zend_always_inline void dom_mark_document_cache_as_modified_since_parsing(php_libxml_ref_obj *doc_ptr)
+{
+ if (doc_ptr) {
+ doc_ptr->cache_tag.modification_nr = MAX(dom_minimum_modification_nr_since_parsing(doc_ptr) + 1,
+ doc_ptr->cache_tag.modification_nr);
+ }
+}
+
+/* Marks the ID cache as potentially stale */
+static zend_always_inline void dom_mark_ids_modified(php_libxml_ref_obj *doc_ptr)
+{
+ /* Although this is currently a wrapper function, it's best to abstract the actual implementation away. */
+ dom_mark_document_cache_as_modified_since_parsing(doc_ptr);
+}
+
+static zend_always_inline bool dom_is_document_cache_modified_since_parsing(php_libxml_ref_obj *doc_ptr)
+{
+ return !doc_ptr || doc_ptr->cache_tag.modification_nr > dom_minimum_modification_nr_since_parsing(doc_ptr);
+}
+
#endif
diff --git a/ext/dom/node.c b/ext/dom/node.c
index 14992da952676..5200a4d3cc49a 100644
--- a/ext/dom/node.c
+++ b/ext/dom/node.c
@@ -185,6 +185,7 @@ zend_result dom_node_node_value_write(dom_object *obj, zval *newval)
/* Access to Element node is implemented as a convenience method */
switch (nodep->type) {
case XML_ATTRIBUTE_NODE:
+ dom_attr_value_will_change(obj, (xmlAttrPtr) nodep);
if (php_dom_follow_spec_intern(obj)) {
dom_remove_all_children(nodep);
xmlAddChild(nodep, xmlNewTextLen(BAD_CAST ZSTR_VAL(str), ZSTR_LEN(str)));
diff --git a/ext/dom/php_dom.h b/ext/dom/php_dom.h
index 27c603a3d9d40..1978e94d1b3e3 100644
--- a/ext/dom/php_dom.h
+++ b/ext/dom/php_dom.h
@@ -174,6 +174,7 @@ xmlDocPtr php_dom_create_html_doc(void);
xmlEntityPtr dom_entity_reference_fetch_and_sync_declaration(xmlNodePtr reference);
void dom_set_xml_class(php_libxml_ref_obj *document);
bool dom_compare_value(const xmlAttr *attr, const xmlChar *value);
+void dom_attr_value_will_change(dom_object *obj, xmlAttrPtr attrp);
typedef enum {
DOM_LOAD_STRING = 0,
diff --git a/ext/dom/tests/bug79701/id_property.phpt b/ext/dom/tests/bug79701/id_property.phpt
new file mode 100644
index 0000000000000..035a6c82f0f73
--- /dev/null
+++ b/ext/dom/tests/bug79701/id_property.phpt
@@ -0,0 +1,53 @@
+--TEST--
+Bug #79701 (getElementById does not correctly work with duplicate definitions) - id property variation
+--EXTENSIONS--
+dom
+--FILE--
+
+