open62541_XMLPreprocessor.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. ###
  4. ### Author: Chris Iatrou (ichrispa@core-vector.net)
  5. ###
  6. ### This program was created for educational purposes and has been
  7. ### contributed to the open62541 project by the author. All licensing
  8. ### terms for this source is inherited by the terms and conditions
  9. ### specified for by the open62541 project (see the projects readme
  10. ### file for more information on the LGPL terms and restrictions).
  11. ###
  12. ### This program is not meant to be used in a production environment. The
  13. ### author is not liable for any complications arising due to the use of
  14. ### this program.
  15. ###
  16. from logger import *
  17. from ua_constants import *
  18. import tempfile
  19. import xml.dom.minidom as dom
  20. import os
  21. import string
  22. from collections import Counter
  23. from ua_namespace import opcua_node_id_t
  24. class preProcessDocument:
  25. originXML = '' # Original XML passed to the preprocessor
  26. targetXML = () # tuple of (fileHandle, fileName)
  27. nodeset = '' # Parsed DOM XML object
  28. parseOK = False;
  29. containedNodes = [] # contains tuples of (opcua_node_id_t, xmlelement)
  30. referencedNodes = [] # contains tuples of (opcua_node_id_t, xmlelement)
  31. namespaceOrder = [] # contains xmlns:sX attributed as tuples (int ns, string name)
  32. namespaceQualifiers = [] # contains all xmlns:XYZ qualifiers that might prefix value aliases (like "<uax:Int32>")
  33. referencedNamesSpaceUris = [] # contains <NamespaceUris> URI elements
  34. def __init__(self, originXML):
  35. self.originXML = originXML
  36. self.targetXML = tempfile.mkstemp(prefix=os.path.basename(originXML)+"_preProcessed-" ,suffix=".xml")
  37. self.parseOK = True
  38. self.containedNodes = []
  39. self.referencedNodes = []
  40. self.namespaceOrder = []
  41. self.referencedNamesSpaceUris = []
  42. self.namespaceQualifiers = []
  43. try:
  44. self.nodeset = dom.parse(originXML)
  45. if len(self.nodeset.getElementsByTagName("UANodeSet")) == 0 or len(self.nodeset.getElementsByTagName("UANodeSet")) > 1:
  46. log(self, "Document " + self.targetXML[1] + " contains no or more then 1 nodeset", LOG_LEVEL_ERROR)
  47. self.parseOK = False
  48. except:
  49. self.parseOK = False
  50. log(self, "Adding new document to be preprocessed " + os.path.basename(originXML) + " as " + self.targetXML[1], LOG_LEVEL_DEBUG)
  51. def clean(self):
  52. #os.close(self.targetXML[0]) Don't -> done to flush() after finalize()
  53. os.remove(self.targetXML[1])
  54. def getTargetXMLName(self):
  55. if (self.parseOK):
  56. return self.targetXML[1]
  57. return None
  58. def extractNamespaceURIs(self):
  59. """ extractNamespaceURIs
  60. minidom gobbles up <NamespaceUris></NamespaceUris> elements, without a decent
  61. way to reliably access this dom2 <uri></uri> elements (only attribute xmlns= are
  62. accessible using minidom). We need them for dereferencing though... This
  63. function attempts to do just that.
  64. returns: Nothing
  65. """
  66. infile = open(self.originXML)
  67. foundURIs = False
  68. nsline = ""
  69. line = infile.readline()
  70. for line in infile:
  71. if "<namespaceuris>" in line.lower():
  72. foundURIs = True
  73. elif "</namespaceuris>" in line.lower():
  74. foundURIs = False
  75. nsline = nsline + line
  76. break
  77. if foundURIs:
  78. nsline = nsline + line
  79. if len(nsline) > 0:
  80. ns = dom.parseString(nsline).getElementsByTagName("NamespaceUris")
  81. for uri in ns[0].childNodes:
  82. if uri.nodeType != uri.ELEMENT_NODE:
  83. continue
  84. self.referencedNamesSpaceUris.append(uri.firstChild.data)
  85. infile.close()
  86. def analyze(self):
  87. """ analyze()
  88. analyze will gather information about the nodes and references contained in a XML File
  89. to facilitate later preprocessing stages that adresss XML dependency issues
  90. returns: No return value
  91. """
  92. nodeIds = []
  93. ns = self.nodeset.getElementsByTagName("UANodeSet")
  94. # We need to find out what the namespace calls itself and other referenced, as numeric id's are pretty
  95. # useless sans linked nodes. There is two information sources...
  96. self.extractNamespaceURIs() # From <URI>...</URI> definitions
  97. for key in ns[0].attributes.keys(): # from xmlns:sX attributes
  98. if "xmlns:" in key: # Any key: we will be removing these qualifiers from Values later
  99. self.namespaceQualifiers.append(key.replace("xmlns:",""))
  100. if "xmlns:s" in key: # get a numeric nsId and modelname/uri
  101. self.namespaceOrder.append((int(key.replace("xmlns:s","")), ns[0].getAttribute(key)))
  102. # Get all nodeIds contained in this XML
  103. for nd in ns[0].childNodes:
  104. if nd.nodeType != nd.ELEMENT_NODE:
  105. continue
  106. if nd.hasAttribute(u'NodeId'):
  107. self.containedNodes.append( (opcua_node_id_t(nd.getAttribute(u'NodeId')), nd) )
  108. refs = nd.getElementsByTagName(u'References')
  109. if len(refs) > 0:
  110. refs = refs[0]
  111. for ref in refs.childNodes:
  112. if ref.nodeType == ref.ELEMENT_NODE:
  113. self.referencedNodes.append( (opcua_node_id_t(ref.firstChild.data), ref) )
  114. log(self, "Nodes: " + str(len(self.containedNodes)) + " References: " + str(len(self.referencedNodes)), LOG_LEVEL_DEBUG)
  115. def getNamespaceId(self):
  116. """ namespaceId()
  117. Counts the namespace IDs in all nodes of this XML and picks the most used
  118. namespace as the numeric identifier of this data model.
  119. returns: Integer ID of the most propable/most used namespace in this XML
  120. """
  121. max = 0;
  122. namespaceIdGuessed = 0;
  123. idDict = {}
  124. for ndid in self.containedNodes:
  125. if not idDict.has_key(ndid[0].ns):
  126. idDict[ndid[0].ns] = 1
  127. else:
  128. idDict[ndid[0].ns] = idDict[ndid[0].ns] + 1
  129. for entry in idDict:
  130. if idDict[entry] > max:
  131. max = idDict[entry]
  132. namespaceIdGuessed = entry
  133. log(self, "XML Contents are propably in namespace " + str(entry) + " (used by " + str(idDict[entry]) + " Nodes)", LOG_LEVEL_DEBUG)
  134. return namespaceIdGuessed
  135. def getReferencedNamespaceUri(self, nsId):
  136. """ getReferencedNamespaceUri
  137. returns an URL that hopefully corresponds to the nsId that was used to reference this model
  138. return: URI string corresponding to nsId
  139. """
  140. # Might be the more reliable method: Get the URI from the xmlns attributes (they have numers)
  141. if len(self.namespaceOrder) > 0:
  142. for el in self.namespaceOrder:
  143. if el[0] == nsId:
  144. return el[1]
  145. # Fallback:
  146. # Some models do not have xmlns:sX attributes, but still <URI>s (usually when they only reference NS0)
  147. if len(self.referencedNamesSpaceUris) > 0 and len(self.referencedNamesSpaceUris) >= nsId-1:
  148. return self.referencedNamesSpaceUris[nsId-1]
  149. #Nope, not found.
  150. return ""
  151. def getNamespaceDependencies(self):
  152. deps = []
  153. for ndid in self.referencedNodes:
  154. if not ndid[0].ns in deps:
  155. deps.append(ndid[0].ns)
  156. return deps
  157. def finalize(self):
  158. outfile = self.targetXML[0]
  159. outline = self.nodeset.toxml()
  160. for qualifier in self.namespaceQualifiers:
  161. rq = qualifier+":"
  162. outline = outline.replace(rq.decode('UTF-8'), "")
  163. os.write(outfile, outline.encode('UTF-8'))
  164. os.close(outfile)
  165. def reassignReferencedNamespaceId(self, currentNsId, newNsId):
  166. """ reassignReferencedNamespaceId
  167. Iterates over all references in this document, find references to currentNsId and changes them to newNsId.
  168. NodeIds themselves are not altered.
  169. returns: nothing
  170. """
  171. for refNd in self.referencedNodes:
  172. if refNd[0].ns == currentNsId:
  173. refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
  174. refNd[0].ns = newNsId
  175. refNd[0].toString()
  176. def reassignNamespaceId(self, currentNsId, newNsId):
  177. """ reassignNamespaceId
  178. Iterates over all nodes in this document, find those in namespace currentNsId and changes them to newNsId.
  179. returns: nothing
  180. """
  181. log(self, "Migrating nodes /w ns index " + str(currentNsId) + " to " + str(newNsId), LOG_LEVEL_DEBUG)
  182. for nd in self.containedNodes:
  183. if nd[0].ns == currentNsId:
  184. # In our own document, update any references to this node
  185. for refNd in self.referencedNodes:
  186. if refNd[0].ns == currentNsId and refNd[0] == nd[0]:
  187. refNd[1].firstChild.data = refNd[1].firstChild.data.replace("ns="+str(currentNsId), "ns="+str(newNsId))
  188. refNd[0].ns = newNsId
  189. refNd[0].toString()
  190. nd[1].setAttribute(u'NodeId', nd[1].getAttribute(u'NodeId').replace("ns="+str(currentNsId), "ns="+str(newNsId)))
  191. nd[0].ns = newNsId
  192. nd[0].toString()
  193. class open62541_XMLPreprocessor:
  194. preProcDocuments = []
  195. def __init__(self):
  196. self.preProcDocuments = []
  197. def addDocument(self, documentPath):
  198. self.preProcDocuments.append(preProcessDocument(documentPath))
  199. def removePreprocessedFiles(self):
  200. for doc in self.preProcDocuments:
  201. doc.clean()
  202. def getPreProcessedFiles(self):
  203. files = []
  204. for doc in self.preProcDocuments:
  205. if (doc.parseOK):
  206. files.append(doc.getTargetXMLName())
  207. return files
  208. def testModelCongruencyAgainstReferences(self, doc, refs):
  209. """ testModelCongruencyAgainstReferences
  210. Counts how many of the nodes referencef in refs can be found in the model
  211. doc.
  212. returns: double corresponding to the percentage of hits
  213. """
  214. sspace = len(refs)
  215. if sspace == 0:
  216. return float(0)
  217. found = 0
  218. for ref in refs:
  219. for n in doc.containedNodes:
  220. if str(ref) == str(n[0]):
  221. print ref, n[0]
  222. found = found + 1
  223. break
  224. return float(found)/float(sspace)
  225. def preprocess_assignUniqueNsIds(self):
  226. nsdep = []
  227. docLst = []
  228. # Search for namespace 0('s) - plural possible if user is overwriting NS0 defaults
  229. # Remove them from the list of namespaces, zero does not get demangled
  230. for doc in self.preProcDocuments:
  231. if doc.getNamespaceId() == 0:
  232. docLst.append(doc)
  233. for doc in docLst:
  234. self.preProcDocuments.remove(doc)
  235. # Reassign namespace id's to be in ascending order
  236. nsidx = 1 # next namespace id to assign on collision (first one will be "2")
  237. for doc in self.preProcDocuments:
  238. nsidx = nsidx + 1
  239. nsid = doc.getNamespaceId()
  240. doc.reassignNamespaceId(nsid, nsidx)
  241. docLst.append(doc)
  242. log(self, "Document " + doc.originXML + " is now namespace " + str(nsidx), LOG_LEVEL_INFO)
  243. self.preProcDocuments = docLst
  244. def getUsedNamespaceArrayNames(self):
  245. """ getUsedNamespaceArrayNames
  246. Returns the XML xmlns:s1 or <URI>[0] of each XML document (if contained/possible)
  247. returns: dict of int:nsId -> string:url
  248. """
  249. nsName = {}
  250. for doc in self.preProcDocuments:
  251. uri = doc.getReferencedNamespaceUri(1)
  252. if uri == None:
  253. uri = "http://modeluri.not/retrievable/from/xml"
  254. nsName[doc.getNamespaceId()] = doc.getReferencedNamespaceUri(1)
  255. return nsName
  256. def preprocess_linkDependantModels(self):
  257. revertToStochastic = [] # (doc, int id), where id was not resolvable using model URIs
  258. # Attemp to identify the model relations by using model URIs in xmlns:sX or <URI> contents
  259. for doc in self.preProcDocuments:
  260. nsid = doc.getNamespaceId()
  261. dependencies = doc.getNamespaceDependencies()
  262. for d in dependencies:
  263. if d != nsid and d != 0:
  264. # Attempt to identify the namespace URI this d referes to...
  265. nsUri = doc.getReferencedNamespaceUri(d) # FIXME: This could actually fail and return ""!
  266. log(self, "Need a namespace referenced as " + str(d) + ". Which hopefully is " + nsUri, LOG_LEVEL_INFO)
  267. targetDoc = None
  268. for tgt in self.preProcDocuments:
  269. # That model, whose URI is known but its current id is not, will
  270. # refer have referred to itself as "1"
  271. if tgt.getReferencedNamespaceUri(1) == nsUri:
  272. targetDoc = tgt
  273. break
  274. if not targetDoc == None:
  275. # Found the model... relink the references
  276. doc.reassignReferencedNamespaceId(d, targetDoc.getNamespaceId())
  277. continue
  278. else:
  279. revertToStochastic.append((doc, d))
  280. log(self, "Failed to reliably identify which XML/Model " + os.path.basename(doc.originXML) + " calls ns=" +str(d), LOG_LEVEL_WARN)
  281. for (doc, d) in revertToStochastic:
  282. log(self, "Attempting to find stochastic match for target namespace ns=" + str(d) + " of " + os.path.basename(doc.originXML), LOG_LEVEL_WARN)
  283. # Copy all references to the given namespace
  284. refs = []
  285. matches = [] # list of (match%, targetDoc) to pick from later
  286. for ref in doc.referencedNodes:
  287. if ref[0].ns == d:
  288. refs.append(opcua_node_id_t(str(ref[0])))
  289. for tDoc in self.preProcDocuments:
  290. tDocId = tDoc.getNamespaceId()
  291. # Scenario: If these references did target this documents namespace...
  292. for r in refs:
  293. r.ns = tDocId
  294. r.toString()
  295. # ... how many of them would be found!?
  296. c = self.testModelCongruencyAgainstReferences(tDoc, refs)
  297. print c
  298. if c>0:
  299. matches.append(c, tDoc)
  300. best = (0, None)
  301. for m in matches:
  302. print m[0]
  303. if m[0] > best[0]:
  304. best = m
  305. if best[1] != None:
  306. log(self, "Best match (" + str(best[1]*100) + "%) for what " + os.path.basename(doc.originXML) + " refers to as ns="+str(d)+" was " + os.path.basename(best[1].originXML), LOG_LEVEL_WARN)
  307. doc.reassignReferencedNamespaceId(d, best[1].getNamespaceId())
  308. else:
  309. log(self, "Failed to find a match for what " + os.path.basename(doc.originXML) + " refers to as ns=" + str(d) ,LOG_LEVEL_ERROR )
  310. def preprocessAll(self):
  311. ##
  312. ## First: Gather statistics about the namespaces:
  313. for doc in self.preProcDocuments:
  314. doc.analyze()
  315. # Preprocess step: Remove XML specific Naming scheme ("uax:")
  316. # FIXME: Not implemented
  317. ##
  318. ## Preprocess step: Check namespace ID multiplicity and reassign IDs if necessary
  319. ##
  320. self.preprocess_assignUniqueNsIds()
  321. self.preprocess_linkDependantModels()
  322. ##
  323. ## Prep step: prevent any XML from using namespace 1 (reserved for instances)
  324. ## FIXME: Not implemented
  325. ##
  326. ## Final: Write modified XML tmp files
  327. for doc in self.preProcDocuments:
  328. doc.finalize()
  329. return True