open62541_XMLPreprocessor.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # This Source Code Form is subject to the terms of the Mozilla Public
  4. # License, v. 2.0. If a copy of the MPL was not distributed with this
  5. # file, You can obtain one at http://mozilla.org/MPL/2.0/.
  6. ###
  7. ### Author: Chris Iatrou (ichrispa@core-vector.net)
  8. ###
  9. ### This program was created for educational purposes and has been
  10. ### contributed to the open62541 project by the author. All licensing
  11. ### terms for this source is inherited by the terms and conditions
  12. ### specified for by the open62541 project (see the projects readme
  13. ### file for more information on the MPLv2 terms and restrictions).
  14. ###
  15. ### This program is not meant to be used in a production environment. The
  16. ### author is not liable for any complications arising due to the use of
  17. ### this program.
  18. ###
  19. import logging
  20. logger = logging.getLogger(__name__)
  21. from ua_constants import *
  22. import tempfile
  23. import xml.dom.minidom as dom
  24. import os
  25. import string
  26. from collections import Counter
  27. import re
  28. from ua_namespace import opcua_node_id_t
  29. class preProcessDocument:
  30. originXML = '' # Original XML passed to the preprocessor
  31. targetXML = () # tuple of (fileHandle, fileName)
  32. nodeset = '' # Parsed DOM XML object
  33. parseOK = False;
  34. containedNodes = [] # contains tuples of (opcua_node_id_t, xmlelement)
  35. referencedNodes = [] # contains tuples of (opcua_node_id_t, xmlelement)
  36. namespaceOrder = [] # contains xmlns:sX attributed as tuples (int ns, string name)
  37. namespaceQualifiers = [] # contains all xmlns:XYZ qualifiers that might prefix value aliases (like "<uax:Int32>")
  38. referencedNamesSpaceUris = [] # contains <NamespaceUris> URI elements
  39. def __init__(self, originXML):
  40. self.originXML = originXML
  41. self.targetXML = tempfile.mkstemp(prefix=os.path.basename(originXML)+"_preProcessed-" ,suffix=".xml")
  42. self.parseOK = True
  43. self.containedNodes = []
  44. self.referencedNodes = []
  45. self.namespaceOrder = []
  46. self.referencedNamesSpaceUris = []
  47. self.namespaceQualifiers = []
  48. try:
  49. self.nodeset = dom.parse(originXML)
  50. if len(self.nodeset.getElementsByTagName("UANodeSet")) == 0 or len(self.nodeset.getElementsByTagName("UANodeSet")) > 1:
  51. logger.error(self, "Document " + self.targetXML[1] + " contains no or more then 1 nodeset", LOG_LEVEL_ERROR)
  52. self.parseOK = False
  53. except:
  54. self.parseOK = False
  55. logger.debug("Adding new document to be preprocessed " + os.path.basename(originXML) + " as " + self.targetXML[1])
  56. def clean(self):
  57. #os.close(self.targetXML[0]) Don't -> done to flush() after finalize()
  58. os.remove(self.targetXML[1])
  59. def getTargetXMLName(self):
  60. if (self.parseOK):
  61. return self.targetXML[1]
  62. return None
  63. def extractNamespaceURIs(self):
  64. """ minidom gobbles up <NamespaceUris></NamespaceUris> elements, without a decent
  65. way to reliably access this dom2 <uri></uri> elements (only attribute xmlns= are
  66. accessible using minidom). We need them for dereferencing though... This
  67. function attempts to do just that.
  68. """
  69. infile = open(self.originXML)
  70. foundURIs = False
  71. nsline = ""
  72. line = infile.readline()
  73. for line in infile:
  74. if "<namespaceuris>" in line.lower():
  75. foundURIs = True
  76. elif "</namespaceuris>" in line.lower():
  77. foundURIs = False
  78. nsline = nsline + line
  79. break
  80. if foundURIs:
  81. nsline = nsline + line
  82. if len(nsline) > 0:
  83. ns = dom.parseString(nsline).getElementsByTagName("NamespaceUris")
  84. for uri in ns[0].childNodes:
  85. if uri.nodeType != uri.ELEMENT_NODE:
  86. continue
  87. self.referencedNamesSpaceUris.append(uri.firstChild.data)
  88. infile.close()
  89. def analyze(self):
  90. """ analyze will gather information about the nodes and references contained in a XML File
  91. to facilitate later preprocessing stages that adresss XML dependency issues
  92. """
  93. nodeIds = []
  94. ns = self.nodeset.getElementsByTagName("UANodeSet")
  95. # We need to find out what the namespace calls itself and other referenced, as numeric id's are pretty
  96. # useless sans linked nodes. There is two information sources...
  97. self.extractNamespaceURIs() # From <URI>...</URI> definitions
  98. for key in ns[0].attributes.keys(): # from xmlns:sX attributes
  99. if "xmlns:" in key: # Any key: we will be removing these qualifiers from Values later
  100. self.namespaceQualifiers.append(key.replace("xmlns:",""))
  101. if "xmlns:s" in key: # get a numeric nsId and modelname/uri
  102. self.namespaceOrder.append((int(key.replace("xmlns:s","")), re.sub("[A-Za-z0-9-_\.]+\.[xXsSdD]{3}$","",ns[0].getAttribute(key))))
  103. # Get all nodeIds contained in this XML
  104. for nd in ns[0].childNodes:
  105. if nd.nodeType != nd.ELEMENT_NODE:
  106. continue
  107. if nd.hasAttribute(u'NodeId'):
  108. self.containedNodes.append( (opcua_node_id_t(nd.getAttribute(u'NodeId')), nd) )
  109. refs = nd.getElementsByTagName(u'References')[0]
  110. for ref in refs.childNodes:
  111. if ref.nodeType == ref.ELEMENT_NODE:
  112. self.referencedNodes.append( (opcua_node_id_t(ref.firstChild.data), ref) )
  113. logger.debug("Nodes: " + str(len(self.containedNodes)) + " References: " + str(len(self.referencedNodes)))
  114. def getNamespaceId(self):
  115. """ Counts the namespace IDs in all nodes of this XML and picks the most used
  116. namespace as the numeric identifier of this data model.
  117. returns: Integer ID of the most propable/most used namespace in this XML
  118. """
  119. max = 0;
  120. namespaceIdGuessed = 0;
  121. idDict = {}
  122. for ndid in self.containedNodes:
  123. if not ndid[0].ns in idDict.keys():
  124. idDict[ndid[0].ns] = 1
  125. else:
  126. idDict[ndid[0].ns] = idDict[ndid[0].ns] + 1
  127. for entry in idDict:
  128. if idDict[entry] > max:
  129. max = idDict[entry]
  130. namespaceIdGuessed = entry
  131. return namespaceIdGuessed
  132. def getReferencedNamespaceUri(self, nsId):
  133. """ Returns an URL that hopefully corresponds to the nsId that was used to reference this model """
  134. # Might be the more reliable method: Get the URI from the xmlns attributes (they have numers)
  135. if len(self.namespaceOrder) > 0:
  136. for el in self.namespaceOrder:
  137. if el[0] == nsId:
  138. return el[1]
  139. # Fallback: Some models do not have xmlns:sX attributes, but still <URI>s
  140. # (usually when they only reference NS0)
  141. if len(self.referencedNamesSpaceUris) > 0 and len(self.referencedNamesSpaceUris) >= nsId-1:
  142. return self.referencedNamesSpaceUris[nsId-1]
  143. #Nope, not found.
  144. return ""
  145. def getNamespaceDependencies(self):
  146. deps = []
  147. for ndid in self.referencedNodes:
  148. if not ndid[0].ns in deps:
  149. deps.append(ndid[0].ns)
  150. return deps
  151. def finalize(self):
  152. outfile = self.targetXML[0]
  153. outline = self.nodeset.toxml()
  154. for qualifier in self.namespaceQualifiers:
  155. rq = qualifier+":"
  156. outline = outline.replace(rq, "")
  157. os.write(outfile, outline.encode('UTF-8'))
  158. os.close(outfile)
  159. def reassignReferencedNamespaceId(self, currentNsId, newNsId):
  160. """ Iterates over all references in this document, find references to currentNsId and changes them to newNsId.
  161. NodeIds themselves are not altered.
  162. returns: nothing
  163. """
  164. for refNd in self.referencedNodes:
  165. if refNd[0].ns == currentNsId:
  166. refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
  167. refNd[0].ns = newNsId
  168. refNd[0].toString()
  169. def reassignNamespaceId(self, currentNsId, newNsId):
  170. """ Iterates over all nodes in this document, find those in namespace currentNsId and changes them to newNsId.
  171. returns: nothing
  172. """
  173. #change ids in aliases
  174. ns = self.nodeset.getElementsByTagName("Alias")
  175. for al in ns:
  176. if al.nodeType == al.ELEMENT_NODE:
  177. if al.hasAttribute("Alias"):
  178. al.firstChild.data = al.firstChild.data.replace("ns=" + str(currentNsId), "ns=" + str(newNsId))
  179. logger.debug("Migrating nodes /w ns index " + str(currentNsId) + " to " + str(newNsId))
  180. for nd in self.containedNodes:
  181. if nd[0].ns == currentNsId:
  182. # In our own document, update any references to this node
  183. for refNd in self.referencedNodes:
  184. if refNd[0].ns == currentNsId and refNd[0] == nd[0]:
  185. refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
  186. refNd[0].ns = newNsId
  187. refNd[0].toString()
  188. nd[1].setAttribute(u'NodeId', nd[1].getAttribute(u'NodeId').replace("ns="+str(currentNsId), "ns="+str(newNsId)))
  189. nd[0].ns = newNsId
  190. nd[0].toString()
  191. class open62541_XMLPreprocessor:
  192. def __init__(self):
  193. self.preProcDocuments = []
  194. def addDocument(self, documentPath):
  195. self.preProcDocuments.append(preProcessDocument(documentPath))
  196. def getPreProcessedFiles(self):
  197. files = []
  198. for doc in self.preProcDocuments:
  199. if (doc.parseOK):
  200. files.append(doc.getTargetXMLName())
  201. return files
  202. def testModelCongruencyAgainstReferences(self, doc, refs):
  203. """ Counts how many of the nodes referenced in refs can be found in the model
  204. doc.
  205. returns: double corresponding to the percentage of hits
  206. """
  207. sspace = len(refs)
  208. if sspace == 0:
  209. return float(0)
  210. found = 0
  211. for ref in refs:
  212. for n in doc.containedNodes:
  213. if str(ref) == str(n[0]):
  214. found = found + 1
  215. break
  216. return float(found)/float(sspace)
  217. def preprocess_assignUniqueNsIds(self):
  218. nsdep = []
  219. docLst = []
  220. # Search for namespace 0('s) - plural possible if user is overwriting NS0 defaults
  221. # Remove them from the list of namespaces, zero does not get demangled
  222. for doc in self.preProcDocuments:
  223. if doc.getNamespaceId() == 0:
  224. docLst.append(doc)
  225. for doc in docLst:
  226. self.preProcDocuments.remove(doc)
  227. # Reassign namespace id's to be in ascending order
  228. nsidx = 1 # next namespace id to assign on collision (first one will be "2")
  229. for doc in self.preProcDocuments:
  230. nsidx = nsidx + 1
  231. nsid = doc.getNamespaceId()
  232. doc.reassignNamespaceId(nsid, nsidx)
  233. docLst.append(doc)
  234. logger.info("Document " + doc.originXML + " is now namespace " + str(nsidx))
  235. self.preProcDocuments = docLst
  236. def getUsedNamespaceArrayNames(self):
  237. """ getUsedNamespaceArrayNames
  238. Returns the XML xmlns:s1 or <URI>[0] of each XML document (if contained/possible)
  239. returns: dict of int:nsId -> string:url
  240. """
  241. nsName = {}
  242. for doc in self.preProcDocuments:
  243. uri = doc.getReferencedNamespaceUri(1)
  244. if uri == None:
  245. uri = "http://modeluri.not/retrievable/from/xml"
  246. nsName[doc.getNamespaceId()] = doc.getReferencedNamespaceUri(1)
  247. return nsName
  248. def preprocess_linkDependantModels(self):
  249. revertToStochastic = [] # (doc, int id), where id was not resolvable using model URIs
  250. # Attemp to identify the model relations by using model URIs in xmlns:sX or <URI> contents
  251. for doc in self.preProcDocuments:
  252. nsid = doc.getNamespaceId()
  253. dependencies = doc.getNamespaceDependencies()
  254. for d in dependencies:
  255. if d != nsid and d != 0:
  256. # Attempt to identify the namespace URI this d referes to...
  257. nsUri = doc.getReferencedNamespaceUri(d) # FIXME: This could actually fail and return ""!
  258. logger.info("Need a namespace referenced as " + str(d) + ". Which hopefully is " + nsUri)
  259. targetDoc = None
  260. for tgt in self.preProcDocuments:
  261. # That model, whose URI is known but its current id is not, will
  262. # refer have referred to itself as "1"
  263. if tgt.getReferencedNamespaceUri(1) == nsUri:
  264. targetDoc = tgt
  265. break
  266. if not targetDoc == None:
  267. # Found the model... relink the references
  268. doc.reassignReferencedNamespaceId(d, targetDoc.getNamespaceId())
  269. continue
  270. else:
  271. revertToStochastic.append((doc, d))
  272. logger.warn("Failed to reliably identify which XML/Model " + os.path.basename(doc.originXML) + " calls ns=" +str(d))
  273. for (doc, d) in revertToStochastic:
  274. logger.warn("Attempting to find stochastic match for target namespace ns=" + str(d) + " of " + os.path.basename(doc.originXML))
  275. # Copy all references to the given namespace
  276. refs = []
  277. matches = [] # list of (match%, targetDoc) to pick from later
  278. for ref in doc.referencedNodes:
  279. if ref[0].ns == d:
  280. refs.append(opcua_node_id_t(str(ref[0])))
  281. for tDoc in self.preProcDocuments:
  282. tDocId = tDoc.getNamespaceId()
  283. # Scenario: If these references did target this documents namespace...
  284. for r in refs:
  285. r.ns = tDocId
  286. r.toString()
  287. # ... how many of them would be found!?
  288. c = self.testModelCongruencyAgainstReferences(tDoc, refs)
  289. if c>0:
  290. matches.append((c, tDoc))
  291. best = (0, None)
  292. for m in matches:
  293. if m[0] > best[0]:
  294. best = m
  295. if best[1] != None:
  296. logger.warn("Best match (%d) for what %s refers to as ns=%s was %s", best[1], os.path.basename(doc.originXML), d, os.path.basename(best[1].originXML))
  297. doc.reassignReferencedNamespaceId(d, best[1].getNamespaceId())
  298. else:
  299. logger.error("Failed to find a match for what " + os.path.basename(doc.originXML) + " refers to as ns=" + str(d))
  300. def preprocessAll(self):
  301. # Gather statistics about the namespaces:
  302. for doc in self.preProcDocuments:
  303. doc.analyze()
  304. # Preprocess step: Remove XML specific Naming scheme ("uax:")
  305. # FIXME: Not implemented
  306. # Check namespace ID multiplicity and reassign IDs if necessary
  307. self.preprocess_assignUniqueNsIds()
  308. self.preprocess_linkDependantModels()
  309. # Prep step: prevent any XML from using namespace 1 (reserved for instances)
  310. # FIXME: Not implemented
  311. # Final: Write modified XML tmp files
  312. for doc in self.preProcDocuments:
  313. doc.finalize()