1 package ca.uhn.hl7v2.preparser;
2
3 import java.io.IOException;
4 import java.util.ArrayList;
5 import java.util.Collection;
6 import java.util.Iterator;
7 import java.util.List;
8 import java.util.Map;
9 import java.util.Properties;
10 import java.util.SortedMap;
11 import java.util.TreeMap;
12
13 import javax.xml.parsers.ParserConfigurationException;
14 import javax.xml.parsers.SAXParser;
15 import javax.xml.parsers.SAXParserFactory;
16
17 import org.xml.sax.Attributes;
18 import org.xml.sax.InputSource;
19 import org.xml.sax.SAXException;
20 import org.xml.sax.SAXParseException;
21 import org.xml.sax.helpers.DefaultHandler;
22
23 import ca.uhn.hl7v2.HL7Exception;
24
25 public class XML
26 {
27 @SuppressWarnings("serial")
28 protected static class StopParsingException extends SAXException
29 {
30 public StopParsingException()
31 {
32 super("ca.uhn.hl7.....StopParsingException");
33 }
34 }
35
36 /** the SAXParser reports parsing events to an object of this class.
37 We keep track of some parsing state, and the Properties object that
38 we're supposed to write our data to.
39 */
40 static protected class HL7MessageHandler extends DefaultHandler
41 {
42 /* m_props & m_msgMask should be set by the user of this handler before
43 they pass this handler to SAXParser.parse() or whatever */
44
45 /** The data that is found while parsing, and which passes m_msgMask,
46 will be dumped to m_props, as (DatumPath.toString() / text) key/value
47 pairs */
48 public Properties m_props = null;
49
50 /** Specifies what parts of a message should be dumped to m_props.
51 */
52 public Collection<DatumPath> m_msgMask = null;
53
54 /* All other fields are parser state. */
55
56 protected boolean m_startedDocument = false;
57
58 /* m_msgID / m_curPath together keep track of where we are in the document.
59
60 If m_msgID.length() != 0, then we're within the message element. (We're only
61 expecting one message per document.) Then m_msgID will be the name of the
62 message. ("ACK" or whatever).
63
64 m_curPath keeps track of where within the message we are. See notes at
65 DatumPath class definition. If m_curPath.size() != 0, then we must be
66 within a message.
67
68 At any point in the code below:
69
70 if m_msgID.length() == 0,
71 then m_curPath().size() == 0
72
73 if m_curPath.length() != 0
74 then m_msgID.length() != 0
75
76 Note that our DatumPaths count indices starting from 0 (not 1) -- they're
77 only converted to 1-based in the string representations that wind up
78 as m_props keys.
79 */
80 final StringBuffer m_msgID = new StringBuffer();
81 final DatumPathtml#DatumPath">DatumPath m_curPath = new DatumPath();
82
83 /* the location in the document of the last datum we dumped to m_props. */
84 final DatumPathumPath">DatumPath m_lastDumpedPath = new DatumPath();
85
86 /** For handling repeat segments. segmentID (String) -> next repeat idx
87 (Integer). So when we hit a segment ZYX, we'll know how many times we've
88 hit a ZYX before, and set the segmentRepIdx part of m_curPath
89 appropriately. */
90 final SortedMap<String, Integer> m_segmentId2nextRepIdx = new TreeMap<>();
91
92 /* m_depthWithinUselessElement and m_depthWithinUsefulElement
93 reflect what m_msgMask thinks about our location in the document at any
94 given time.
95
96 Both should always be >= -1. Note that both can be >= 0 at the same time
97 -- explained in a minute....
98
99 If m_depthWithinUsefulElement >= 0, this means that we are however deep
100 (in terms of nested elements: 0 => just within) within an area of the
101 message that passes m_msgMask. We should should dump whatever we find
102 there to m_props. As we move around within such an element, we will still
103 update m_curPath appropriately.
104
105 If m_depthWithinUsefulElement >= 0, we are however deep within an element
106 which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>
107 -- a few other things maybe), or more importantly that we're within an
108 element that otherwise has no hope of having any useful elements within it
109 according to m_msgMask. (eg. m_msgMask says it wants only ZYX segment
110 contents, we're in an <MSH>). So we can safely ignore all content within,
111 and just keep track of how deep we are within this useless element (with
112 m_depthWithinUselessElement, of course.) We don't update m_curPath when
113 m_depthWithinUselessElement >= 0, there's no point and how would we
114 extract information for the DatumPath out of nonsensical element names
115 anyway.
116
117 If they are both >= 0, this means that there we've found some useless
118 stuff (nonsensical element names?) within a known-useful element.
119 */
120 int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;
121
122 /* With this we keep the text that we've found within a certain element.
123 It's cleared whenever we enter a (sub) element or leave an element. */
124 final StringBuffer m_chars = new StringBuffer(10);
125
/**
 * Creates a handler with freshly reset parser state. m_props and m_msgMask
 * are left null; the caller is expected to assign them before parsing.
 */
public HL7MessageHandler() {
    clear();
}
130
131 void clear()
132 {
133 // reset the state (m_props & m_msgMask are not state)
134 m_startedDocument = false;
135 m_msgID.delete(0, m_msgID.length());
136 m_curPath.clear();
137 // will always be "less than" (according to DatumPath.numbersLessThan)
138 // any sensible DatumPath:
139 m_lastDumpedPath.clear().add("").add(-42).add(-42).add(-42).add(-42).add(-42);
140 m_segmentId2nextRepIdx.clear();
141 m_depthWithinUsefulElement = -1;
142 m_depthWithinUselessElement = -1;
143 m_chars.delete(0, m_chars.length());
144 }
145
146 public void startDocument() throws SAXException
147 {
148 boolean ok = false;
149 if(!m_startedDocument && (m_props != null)) {
150 m_startedDocument = true;
151 ok = true;
152 }
153
154 if(!ok) {
155 clear();
156 throw new StopParsingException();
157 }
158 }
159
160 public void endDocument() throws SAXException
161 {
162 boolean ok = false;
163 if(m_startedDocument) {
164 this.clear();
165 ok = true;
166 }
167
168 if(!ok) {
169 clear();
170 throw new StopParsingException();
171 }
172 }
173
174 public void startElement(String uri, String localName, String qName,
175 Attributes attributes) throws SAXException
176 {
177 //System.err.println("startelem: " + qName + " curpathsize; " +
178 //m_curPath.size());
179 boolean ok = false;
180 if(m_startedDocument) {
181 // A single unit of text data will be within a single element,
182 // -- none of it will be in sub-elements and there will be no
183 // sub-elements fragmenting the data text.
184 // Right now we're entering a new element: this means that anything
185 // in m_chars will be whitespace (likely), or text left over from,
186 // say, the last field, or text that was somewhere it shouldn't have been.
187 // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>"
188 m_chars.delete(0, m_chars.length());
189
190 if(m_depthWithinUselessElement >= 0) {
191 ++m_depthWithinUselessElement;
192 }
193 else {
194 int oldCurPathSize = m_curPath.size();
195 if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath,
196 m_segmentId2nextRepIdx, m_lastDumpedPath, qName))
197 {
198 if(m_curPath.size() > oldCurPathSize) {
199 // assert (m_depthWithinUselessElement == -1) // m_curPath
200 // should not have grown if we're within a useless element.
201 if(m_depthWithinUsefulElement == -1) {
202 // this new element could match one of the DatumPaths in
203 // m_msgMask -- if that's the case, we've just entered a
204 // useful element.
205 // TODO: functional stylee (a la C++'s std::accumulate) ?
206 boolean curPathStartsWithAMaskElem = false;
207 for(Iterator<DatumPath> maskIt = m_msgMask.iterator();
208 !curPathStartsWithAMaskElem && maskIt.hasNext(); )
209 {
210 curPathStartsWithAMaskElem
211 = m_curPath.startsWith(maskIt.next());
212 }
213
214 if(curPathStartsWithAMaskElem)
215 m_depthWithinUsefulElement = 0;
216 else {
217 // so this element we're entering is not specified by m_msgMask
218 // to be useful -- but might it contains elements that
219 // are?
220 boolean aMaskElemStartsWithCurPath = false;
221 for(Iterator<DatumPath> maskIt = m_msgMask.iterator();
222 !aMaskElemStartsWithCurPath && maskIt.hasNext(); )
223 {
224 aMaskElemStartsWithCurPath
225 = maskIt.next().startsWith(m_curPath);
226 }
227
228 if(!aMaskElemStartsWithCurPath) {
229 // ... nope! useless.
230 m_depthWithinUselessElement = 0;
231 m_curPath.setSize(oldCurPathSize);
232 } // else => ok, carry on, m_depthWithinUse{less,ful}Element
233 // still both -1.
234 }
235 }
236 // else => already within a useful element, don't need to compare
237 // against m_msgMask.
238 }
239 }
240 else
241 m_depthWithinUselessElement = 0;
242 }
243 ok = true;
244 }
245
246 if(!ok) {
247 clear();
248 throw new StopParsingException();
249 }
250 }
251
252 /* doc location == msgID & curPath together.
253 If we've encountered an element called "elementNam", then this tries
254 to determine what it is, based on what we already know about the document.
255 returns true if we can make sense of this new element name given the
256 position we're at (represented by msgID / curPath),
257 false if we can't (which probably means this should be a useless element).
258 returning true doesn't mean that we actually changed msgID or curPath, it
259 might mean that we just passed through a segment group element OK.
260 */
261 protected static boolean tryToGrowDocLocationFromElementName(
262 StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/,
263 Map<String, Integer> segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/,
264 String elementName /*in*/)
265 {
266 boolean ok = false; // ok == can we make sense of this new element?
267 // hmm ... where are we in the document:
268 if((msgID.length() == 0) && (curPath.size() == 0)) {
269 // we're entering a message
270 msgID.replace(0, msgID.length(), elementName);
271 segmentId2nextRepIdx.clear();
272 ok = true;
273 }
274 else if((msgID.length() > 0) && (curPath.size() == 0)) {
275 // we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>)
276 // or an actual segment element.
277 if(!(elementName.startsWith("" + msgID + '.'))) {
278 // must be an actual segment.
279 curPath.add(elementName);
280
281 if(segmentId2nextRepIdx.containsKey(elementName))
282 curPath.add(segmentId2nextRepIdx.get(elementName));
283 else
284 curPath.add(Integer.valueOf(0));
285
286 segmentId2nextRepIdx.put(elementName, (Integer) curPath.get(curPath.size() - 1) + 1);
287 }
288 ok = true;
289 }
290 else if((msgID.length() > 0) && (curPath.size() > 0)) {
291 // we're entering a field or a component or a subcomponent.
292 if(curPath.size() == 2) { // we're entering a field element
293 // all fields should start with segment-ID + '.'
294 if(elementName.startsWith("" + curPath.get(0) + '.')) {
295 try {
296 int fieldIdxFromElementName
297 = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
298
299 curPath.add(Integer.valueOf(fieldIdxFromElementName));
300
301 // now add the repetition idx to curPath:
302 if((lastDumpedPath.size() >= 4)
303 && ((Integer) lastDumpedPath.get(2)
304 == fieldIdxFromElementName))
305 {
306 // lastDumpedPath has a fieldIdx and a fieldRepIdx.
307 curPath.add(Integer.valueOf((Integer) lastDumpedPath.get(3) + 1));
308 }
309 else
310 curPath.add(Integer.valueOf(0));
311
312 ok = true;
313 } catch(NumberFormatException ignored) {}
314 } // else => this isn't a field -- must be useless.
315 }
316 else if((curPath.size() == 4) || (curPath.size() == 5)) {
317 // we're entering a component or subcomponent element
318 try {
319 int idxFromElementName
320 = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
321 curPath.add(Integer.valueOf(idxFromElementName));
322 ok = true;
323 } catch(NumberFormatException ignored) {}
324 }
325 }
326 return ok;
327 }
328
329 public void endElement(String uri, String localName, String qName)
330 throws SAXException
331 {
332 //System.err.println("endElement: " + qName);
333 boolean ok = false;
334 if(m_startedDocument) {
335 if(m_depthWithinUselessElement >= 0) {
336 --m_depthWithinUselessElement;
337 ok = true;
338 }
339 else {
340 if((m_msgID.length() > 0) && (m_curPath.size() == 0)) {
341 // we're exiting either a message element or a
342 // segment group element.
343 if((""+qName).compareTo(""+m_msgID) == 0)
344 m_msgID.delete(0, m_msgID.length()); // => exiting message element
345 // else => segment group element -- do nothing.
346
347 ok = true;
348 }
349 else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) {
350 tryToDumpDataToProps();
351
352 if(m_curPath.size() == 2) {
353 // exiting a segment element
354 m_curPath.setSize(0);
355 ok = true;
356 }
357 else if(m_curPath.size() == 4) {
358 // exiting a field element
359 m_curPath.setSize(2);
360 ok = true;
361 }
362 else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) {
363 // exiting a component or a subcomponent
364 m_curPath.setSize(m_curPath.size() - 1);
365 ok = true;
366 }
367 }
368
369 if(m_depthWithinUsefulElement >= 0)
370 --m_depthWithinUsefulElement;
371 }
372 }
373
374 if(!ok) {
375 clear();
376 throw new StopParsingException();
377 }
378 }
379
380 /** try to dump whatever we've got in m_chars to m_props,
381 with a key of m_curPath.toString().
382 */
383 protected void tryToDumpDataToProps()
384 {
385 if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) {
386 /* m_curPath.toString() will be the property key whose value will be
387 m_chars.
388
389 This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9>
390 <PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something
391 like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element. (note: internal
392 DatumPath elements are 0-indexed, string representations of DatumPaths and
393 the XML text is 1-indexed.) So in m_props the key for "P" would have been
394 "ZYX[0]-9[0]-1-1". (the last "-1" is a default that got added by
395 toString()).
396
397 Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0,
398 9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when
399 exiting the ZYX.9 element, we might have written that whitespace to m_props
400 with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1":
401 the same as the key for the "P" ... clobbering "P" in m_props with
402 whitespace.
403
404 But since we know that HL7 fields / components / etc are always in order
405 (numerically), we can count on m_lastDumpedPath and use
406 DatumPath.numbersLessThan to avoid the clobbering.
407 */
408 if((!m_lastDumpedPath.get(0).equals(m_curPath.get(0))) || (m_lastDumpedPath.numbersLessThan(m_curPath)))
409 {
410 if(m_depthWithinUsefulElement >= 0) {
411 m_props.setProperty(m_curPath.toString(), m_chars.toString());
412 m_lastDumpedPath.copy(m_curPath);
413 m_chars.delete(0, m_chars.length());
414 }
415 }
416 }
417 }
418
419 public void characters(char[] chars, int start, int length)
420 {
421 // note that a contiguous run of characters in the document
422 // might get reported to us in several chunks.
423 // (In the order that the text appears in the document,
424 // non-overlapping and with no gaps between chunks.)
425 // An entity like & will reach us as an actual & character.
426
427 if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
428 m_chars.append(chars, start, length);
429 }
430 }
431
432 public void ignoreableWhitespace(char []chars, int start, int length)
433 {
434 // it's unclear which whitespace is considered ignorable for us.
435 // what the heck, add it to m_chars.
436 characters(chars, start, length);
437 }
438
439 public void error(SAXParseException e)
440 {
441 // TODO: remove.
442 System.err.println("Error in " + getClass() + ": " + e);
443 }
444
445 public void fatalError(SAXParseException e) throws SAXException
446 {
447 throw e;
448 }
449 }
450
451 /** parse message according to our HL7 XML handler, and dump the data found
452 to props.
453
454 returns true if we parsed ok, which means well-formed XML, and
455 that's about it. We just barely check against HL7 structure, and ignore any
456 elements / text that is unexpected (that is, impossible in any HL7 message:
457 independant of any message / segment definitions).
458
459 "message" should be an XML document with one top-level element -- that being
460 the message. (<ACK> or whatever). We're only expecting one message to be in
461 "message".
462
463 props can be null if you don't want the data (we still parse). The message
464 data found in message (that passes msgMask) will be added to props as key /
465 value pairs with the key a toString() of the appropriate DatumPath for the
466 location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and
467 the value the corresponding text. So, after calling parseMessage
468 successfully, if you wanted to retrieve the message data from props you
469 might call something like
470 props.getProperty((new DatumPath()).add("MSH").add(1).toString())
471 and that would return a String with "|", probably.
472
473 Note that this package facilitates the extraction of message data in a way
474 independent of message version (i.e. components and whatever getting added):
475
476 With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>",
477 "ZYX[0]-1[0]-1-1" will be the key that ends up in props (see notes at
478 DatumPath.toString())
479
480 So if you, coding for a future version of the FOO message but
481 recieving old-version message data, tried
482 props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString())
483 with the message above (that is, trying to extract a repetition and
484 component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to
485 "fieldy-field-field" in the resulting props.
486
487 If the message was
488 "<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>"
489 and you, coding for an old version of this FOO message but recieving
490 new-version FOO message data, tried
491 props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())
492 you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting
493 props.
494
495 msgMask lets you specify which parts of the message you want dumped to props.
496 Passing in null gets you everything. Otherwise, msgMask's elements should
497 all be DatumPaths (! => ClassCastException), and a particular part of the
498 message will be dumped to props only if it's location, as represented by a
499 DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of
500 msgMask. So if one element of msgMask was a (new DatumPath()).add(new
501 String("ZYX")), then everything in all ZYX segment would get dumped to props.
502 A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first
503 repetitions of same (if there is one) dumped to props. etc. etc. Note that
504 a DatumPath of size() == 0 in msgMask will get you everything, no matter what
505 the other elements of msgMask are, because all DatumPaths startsWith the
506 zero-length DatumPath.
507
508 Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they
509 aren't addressed in msgMask or in the output in props -- basically any
510 element tags at the level immediately inside the message element, and having
511 a name that starts with the message element name + '.', is ignored (meaning
512 it's contents are dealt with the same as if the start and end tags' just
513 wasn't there.)
514 */
515 public static boolean parseMessage(Properties props, String message,
516 Collection<DatumPath> msgMask) throws HL7Exception
517 {
518 boolean ret;
519 try {
520 SAXParserFactory factory = SAXParserFactory.newInstance();
521 SAXParser parser = factory.newSAXParser();
522
523 InputSource inSrc = new InputSource(new java.io.StringReader(message));
524
525 HL7MessageHandler handler = new HL7MessageHandler();
526 handler.m_props = (props != null
527 ? props : new Properties()); // it's expecting a props.
528
529 if(msgMask != null)
530 handler.m_msgMask = msgMask;
531 else {
532 handler.m_msgMask = new ArrayList<>();
533 handler.m_msgMask.add(new DatumPath());
534 }
535
536 parser.parse(inSrc, handler);
537 ret = true;
538 } catch (IOException | StopParsingException e) {
539 throw new HL7Exception(e);
540 } catch (ParserConfigurationException | SAXException e) {
541 throw new HL7Exception(e);
542 }
543
544 return true;
545 }
546
547 public static void main(String[] args)
548 {
549 if(args.length >= 1) {
550 Properties props = new Properties();
551 List<DatumPath> msgMask = new ArrayList<>();
552 msgMask.add(new DatumPath().add("MSH").add(0).add(9));
553 //msgMask.add(new DatumPath());
554 boolean parseret;
555 try {
556 parseret = XML.parseMessage(props, args[0], msgMask);
557 System.err.println("parseMessage returned " + parseret);
558 } catch (HL7Exception e) {
559 e.printStackTrace();
560 }
561 props.list(System.err);
562 }
563 }
564 }
565