5.2. XML Module lxml — Python
pip install lxml
5.2.1. Creating elements
Creating elements:
from lxml.etree import tostring, Element root = Element("iris") print(tostring(root)) # b'<iris/>'
Adding elements using list interface:
from lxml.etree import tostring, Element root = Element('iris') root.append(Element('setosa')) root.append(Element('versicolor')) root.append(Element('virginica')) print(tostring(root)) # b'<iris><setosa/><versicolor/><virginica/></iris>'
5.2.2. Length of a subtree
Length of a subtree:
from lxml.etree import Element root = Element('iris') root.append(Element('setosa')) root.append(Element('versicolor')) root.append(Element('virginica')) print(len(root)) # 3
5.2.3. Selecting subtree
Selecting subtree:
from lxml.etree import Element root = Element('iris') root.append(Element('setosa')) root.append(Element('versicolor')) root.append(Element('virginica')) selected = root[2] print(selected.tag) # virginica
Where is selected element:
from lxml.etree import Element root = Element('iris') root.append(Element('setosa')) root.append(Element('versicolor')) root.append(Element('virginica')) selected = root[1] root.index(selected) # 1 selected = root[2] root.index(selected) # 2
5.2.4. Element tree as a list
Elements are lists:
from lxml.etree import tostring, Element root = Element("iris") root.append(Element("setosa")) root.append(Element("versicolor")) root.append(Element("virginica")) children = list(root) print(children) # [ # <Element setosa at 0x113cd4048>, # <Element versicolor at 0x113cd4188>, # <Element virginica at 0x113cd41c8> # ]
Iterating over elements:
from lxml.etree import Element root = Element("iris") root.append(Element("setosa")) root.append(Element("versicolor")) root.append(Element("virginica")) for child in root: print(child.tag) # setosa # versicolor # virginica
Slicing elements:
from lxml.etree import Element root = Element("iris") root.append(Element("setosa")) root.append(Element("versicolor")) root.append(Element("virginica")) root.insert(0, Element("arctica")) start = root[:1] end = root[-1:] print(start[0].tag) # arctica print(end[0].tag) # virginica
5.2.5. Elements as a dict
Create element using dict interface:
from lxml.etree import tostring, Element tag = Element("iris", kingdom="plantae") print(tostring(tag)) # b'<iris kingdom="plantae"/>'
Get element attributes and values:
from lxml.etree import tostring, Element tag = Element("iris", kingdom="plantae") print(tag.get("kingdom")) # plantae print(tag.get("not-existing")) # None
Set element attributes and values:
from lxml.etree import tostring, Element tag = Element("iris", kingdom="plantae") tag.set("kind", "flower") print(tag.get("kind")) # flower print(tostring(tag)) # b'<iris kingdom="plantae" kind="flower"/>'
Elements carry attributes as a dict:
from lxml.etree import Element tag = Element("iris", kingdom="plantae") tag.set("kind", "flower") tag.keys() # ['kingdom', 'kind'] tag.values() # ['plantae', 'flower'] tag.items() # [('kingdom', 'plantae'), ('kind', 'flower')]
Iterating over element attributes and values:
from lxml.etree import Element tag = Element("iris", kingdom="plantae") tag.set("kind", "flower") for key, value in tag.items(): print(f'{key} -> {value}') # kingdom -> plantae # kind -> flower
Element attributes are also exposed through the dict-like attrib property:
from lxml.etree import Element tag = Element("iris", kingdom="plantae") tag.set("kind", "flower") tag.attrib['kingdom'] # 'plantae' tag.attrib['not-existing'] # Traceback (most recent call last): # KeyError: 'not-existing' tag.attrib['species'] = 'Setosa' tag.attrib.get('species') # 'Setosa' tag.attrib # {'kingdom': 'plantae', 'kind': 'flower', 'species': 'Setosa'} tag.attrib.items() # [('kingdom', 'plantae'), ('kind', 'flower'), ('species', 'Setosa')]
5.2.6. Elements contain text
from lxml.etree import tostring, Element tag = Element("iris") tag.text = "Setosa" tag.text # 'Setosa' tostring(tag) # b'<iris>Setosa</iris>'
5.2.7. Tree iteration
from lxml.etree import tostring, Element, SubElement root = Element("iris") SubElement(root, "species").text = "Setosa" SubElement(root, "species").text = "Virginica" SubElement(root, "flower").text = "Versicolor" print(tostring(root, pretty_print=True)) # b'<iris> # <species>Setosa</species> # <species>Virginica</species> # <flower>Versicolor</flower> # </iris>' for element in root.iter(): print(f'{element.tag} -> {element.text}') # iris -> None # species -> Setosa # species -> Virginica # flower -> Versicolor for element in root.iter("species"): print(f'{element.tag} -> {element.text}') # species -> Setosa # species -> Virginica for element in root.iter("species", "flower"): print(f'{element.tag} -> {element.text}') # species -> Setosa # species -> Virginica # flower -> Versicolor
5.2.8. Entities
from lxml.etree import tostring, Element, SubElement, Entity root = Element("iris") print(tostring(root)) # b'<iris/>' root.append(Entity("#234")) print(tostring(root)) # b'<iris>&#234;</iris>'
5.2.10. Serialization
from lxml.etree import tostring, XML root = XML('<root><a><b/></a></root>') tostring(root) # b'<root><a><b/></a></root>' print(tostring(root, xml_declaration=True)) # b"<?xml version='1.0' encoding='ASCII'?>\n<root><a><b/></a></root>" print(tostring(root, encoding='utf-8')) # b'<root><a><b/></a></root>' print(tostring(root, encoding='iso-8859-2')) # b"<?xml version='1.0' encoding='iso-8859-2'?>\n<root><a><b/></a></root>" print(tostring(root, pretty_print=True)) # b'<root>\n  <a>\n    <b/>\n  </a>\n</root>\n' print(tostring(root, pretty_print=True).decode()) # <root> # <a> # <b/> # </a> # </root>
from lxml.etree import tostring, XML root = XML('<html><head/><body><p>Hello<br/>World</p></body></html>') # default: method = 'xml' tostring(root) # b'<html><head/><body><p>Hello<br/>World</p></body></html>' tostring(root, method='xml') # b'<html><head/><body><p>Hello<br/>World</p></body></html>' tostring(root, method='html') # b'<html><head></head><body><p>Hello<br>World</p></body></html>' print(tostring(root, method='html', pretty_print=True)) # b'<html>\n<head></head>\n<body><p>Hello<br>World</p></body>\n</html>\n' print(tostring(root, method='html', pretty_print=True).decode()) # <html> # <head></head> # <body><p>Hello<br>World</p></body> # </html> tostring(root, method='text') # b'HelloWorld'