001package ca.uhn.hl7v2.preparser;
002
003import java.io.IOException;
004import java.util.ArrayList;
005import java.util.Collection;
006import java.util.Iterator;
007import java.util.List;
008import java.util.Map;
009import java.util.Properties;
010import java.util.SortedMap;
011import java.util.TreeMap;
012
013import javax.xml.parsers.ParserConfigurationException;
014import javax.xml.parsers.SAXParser;
015import javax.xml.parsers.SAXParserFactory;
016
017import org.xml.sax.Attributes;
018import org.xml.sax.InputSource;
019import org.xml.sax.SAXException;
020import org.xml.sax.SAXParseException;
021import org.xml.sax.helpers.DefaultHandler;
022
023import ca.uhn.hl7v2.HL7Exception;
024
025public class XML
026{
027        @SuppressWarnings("serial")
028        protected static class StopParsingException extends SAXException
029        {
030                public StopParsingException() 
031                {
032                        super("ca.uhn.hl7.....StopParsingException");
033                }
034        }
035
036        /** the SAXParser reports parsing events to an object of this class.
037        We keep track of some parsing state, and the Properties object that 
038        we're supposed to write our data to.
039        */
040        static protected class HL7MessageHandler extends DefaultHandler 
041        {
042                /* m_props & m_msgMask should be set by the user of this handler before
043                they pass this handler to SAXParser.parse() or whatever */
044
045                /** The data that is found while parsing, and which passes m_msgMask, 
046                will be dumped to m_props, as (DatumPath.toString() / text) key/value
047                pairs */
048                public Properties m_props = null;
049
050                /** Specifies what parts of a message should be dumped to m_props. 
051                */
052                public Collection<DatumPath> m_msgMask = null;
053
054                /* All other fields are parser state. */
055
056                protected boolean m_startedDocument = false;
057
058                /* m_msgID / m_curPath together keep track of where we are in the document.
059
060                If m_msgID.length() != 0, then we're within the message element.  (We're only
061                expecting one message per document.)  Then m_msgID will be the name of the 
062                message.  ("ACK" or whatever).  
063
064                m_curPath keeps track of where within the message we are.  See notes at 
065                DatumPath class definition.  If m_curPath.size() != 0, then we must be 
066                within a message.
067
068                At any point in the code below: 
069
070                if m_msgID.length() == 0, 
071                        then m_curPath().size() == 0
072
073                if m_curPath.length()  != 0
074                        then m_msgID.length() != 0
075                
076                Note that our DatumPaths count indices starting from 0 (not 1) -- they're 
077                only converted to 1-based in the string representations that wind up 
078                as m_props keys.
079                */
080                StringBuffer m_msgID = new StringBuffer();
081                DatumPath m_curPath = new DatumPath();
082
083                /* the location in the document of the last datum we dumped to m_props. */
084                DatumPath m_lastDumpedPath = new DatumPath();
085
086                /** For handling repeat segments.   segmentID (String) -> next repeat idx
087                (Integer).  So when we hit a segment ZYX, we'll know how many times we've
088                hit a ZYX before, and set the segmentRepIdx part of m_curPath
089                appropriately. */
090                SortedMap<String, Integer> m_segmentId2nextRepIdx = new TreeMap<String, Integer>();
091
092                /* m_depthWithinUselessElement and m_depthWithinUsefulElement 
093                reflect what m_msgMask thinks about our location in the document at any
094                given time.  
095
096                Both should always be >= -1.  Note that both can be >= 0 at the same time
097                -- explained in a minute....
098
099                If m_depthWithinUsefulElement >= 0, this means that we are however deep
100                (in terms of nested elements: 0 => just within) within an area of the
101                message that passes m_msgMask.  We should should dump whatever we find
102                there to m_props.  As we move around within such an element, we will still
103                update m_curPath appropriately.
104
105                If m_depthWithinUsefulElement >= 0, we are however deep within an element
106                which either made no sense (eg. <ZZZ.1> where we were expecting a <ZYX.1>
107                -- a few other things maybe), or more importantly that we're within an
108                element that otherwise has no hope of having any useful elements within it
109                according to m_msgMask.  (eg. m_msgMask says it wants only ZYX segment
110                contents, we're in an <MSH>).  So we can safely ignore all content within,
111                and just keep track of how deep we are within this useless element (with
112                m_depthWithinUselessElement, of course.)  We don't update m_curPath when
113                m_depthWithinUselessElement >= 0, there's no point and how would we
114                extract information for the DatumPath out of nonsensical element names
115                anyway.
116
117                If they are both >= 0, this means that there we've found some useless
118                stuff (nonsensical element names?) within a known-useful element.
119                */
120                int m_depthWithinUsefulElement = -1, m_depthWithinUselessElement = -1;
121
122                /* With this we keep the text that we've found within a certain element.
123                It's cleared whenever we enter a (sub) element or leave an element. */
124                StringBuffer m_chars = new StringBuffer(10);
125
126                public HL7MessageHandler()
127                {
128                        this.clear();
129                }
130
131                void clear()
132                {
133                        // reset the state (m_props & m_msgMask are not state)
134                        m_startedDocument = false;
135                        m_msgID.delete(0, m_msgID.length());
136                        m_curPath.clear();
137                        // will always be "less than" (according to DatumPath.numbersLessThan)
138                        // any sensible DatumPath: 
139                        m_lastDumpedPath.clear().add(new String()).add(-42).add(-42).add(-42).add(-42).add(-42);
140                        m_segmentId2nextRepIdx.clear();
141                        m_depthWithinUsefulElement = -1;
142                        m_depthWithinUselessElement = -1;
143                        m_chars.delete(0, m_chars.length());
144                }
145
146                public void startDocument() throws SAXException
147                {
148                        boolean ok = false;
149                        if(!m_startedDocument && (m_props != null)) {
150                                m_startedDocument = true;
151                                ok = true;
152                        }
153
154                        if(!ok) {
155                                clear();
156                                throw new StopParsingException();
157                        }
158                }
159
160                public void endDocument() throws SAXException
161                {
162                        boolean ok = false;
163                        if(m_startedDocument) {
164                                this.clear();
165                                ok = true;
166                        }
167
168                        if(!ok) {
169                                clear();
170                                throw new StopParsingException();
171                        }
172                }
173
174                public void startElement(String uri, String localName, String qName, 
175                                Attributes attributes) throws SAXException 
176                {
177                        //System.err.println("startelem: " + qName + " curpathsize; " +
178                        //m_curPath.size());
179                        boolean ok = false;
180                        if(m_startedDocument) {
181                                // A single unit of text data will be within a single element, 
182                                // -- none of it will be in sub-elements and there will be no 
183                                // sub-elements fragmenting the data text.
184                                // Right now we're entering a new element: this means that anything
185                                // in m_chars will be whitespace (likely), or text left over from, 
186                                // say, the last field, or text that was somewhere it shouldn't have been.
187                                // (ex. "<ZYX.9> shouldn't be here <PT.1> P </PT.1> </ZYX.9>"
188                                m_chars.delete(0, m_chars.length());
189
190                                if(m_depthWithinUselessElement >= 0) {
191                                        ++m_depthWithinUselessElement;
192                                }
193                                else {
194                                        int oldCurPathSize = m_curPath.size();
195                                        if(tryToGrowDocLocationFromElementName(m_msgID, m_curPath, 
196                                                m_segmentId2nextRepIdx, m_lastDumpedPath, qName)) 
197                                        {
198                                                if(m_curPath.size() > oldCurPathSize) {
199                                                        // assert (m_depthWithinUselessElement == -1) // m_curPath
200                                                        // should not have grown if we're within a useless element.
201                                                        if(m_depthWithinUsefulElement == -1) {
202                                                                // this new element could match one of the DatumPaths in
203                                                                // m_msgMask -- if that's the case, we've just entered a
204                                                                // useful element.
205                                                                // TODO: functional stylee (a la C++'s std::accumulate) ? 
206                                                                boolean curPathStartsWithAMaskElem = false;
207                                                                for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
208                                                                        !curPathStartsWithAMaskElem && maskIt.hasNext(); )
209                                                                {
210                                                                        curPathStartsWithAMaskElem 
211                                                                                = m_curPath.startsWith(maskIt.next());
212                                                                }
213
214                                                                if(curPathStartsWithAMaskElem) 
215                                                                        m_depthWithinUsefulElement = 0;
216                                                                else {
217                                                                        // so this element we're entering is not specified by m_msgMask
218                                                                        // to be useful -- but might it contains elements that
219                                                                        // are?
220                                                                        boolean aMaskElemStartsWithCurPath = false;
221                                                                        for(Iterator<DatumPath> maskIt = m_msgMask.iterator(); 
222                                                                                !aMaskElemStartsWithCurPath && maskIt.hasNext(); )
223                                                                        {
224                                                                                aMaskElemStartsWithCurPath 
225                                                                                        = maskIt.next().startsWith(m_curPath);
226                                                                        }
227
228                                                                        if(!aMaskElemStartsWithCurPath) {
229                                                                                // ... nope!  useless.
230                                                                                m_depthWithinUselessElement = 0;
231                                                                                m_curPath.setSize(oldCurPathSize);
232                                                                        } // else => ok, carry on, m_depthWithinUse{less,ful}Element
233                                                                        // still both -1.
234                                                                }
235                                                        }
236                                                        // else => already within a useful element, don't need to compare 
237                                                        // against m_msgMask.
238                                                }
239                                        }
240                                        else
241                                                m_depthWithinUselessElement = 0;
242                                }
243                                ok = true;
244                        }
245
246                        if(!ok) {
247                                clear();
248                                throw new StopParsingException();
249                        }
250                }
251
252                /* doc location == msgID & curPath together.  
253                If we've encountered an element called "elementNam", then this tries 
254                to determine what it is, based on what we already know about the document.
255                returns true if we can make sense of this new element name given the
256                position we're at (represented by msgID / curPath), 
257                false if we can't (which probably means this should be a useless element). 
258                returning true doesn't mean that we actually changed msgID or curPath, it
259                might mean that we just passed through a segment group element OK.
260                */
261                protected static boolean tryToGrowDocLocationFromElementName(
262                        StringBuffer msgID /*in/out*/, DatumPath curPath /*in/out*/, 
263                        Map<String, Integer> segmentId2nextRepIdx /*in/out*/, DatumPath lastDumpedPath /*in*/, 
264                        String elementName /*in*/)
265                {
266                        boolean ok = false; // ok == can we make sense of this new element?
267                        // hmm ... where are we in the document: 
268                        if((msgID.length() == 0) && (curPath.size() == 0)) {
269                                // we're entering a message
270                                msgID.replace(0, msgID.length(), elementName);
271                                segmentId2nextRepIdx.clear();
272                                ok = true;
273                        }
274                        else if((msgID.length() > 0) && (curPath.size() == 0)) {
275                                // we're entering either a segment-group element (eg. <ADT_A01.PROCEDURE>)
276                                // or an actual segment element.
277                                if(!(elementName.startsWith("" + msgID + '.'))) {
278                                        // must be an actual segment.
279                                        curPath.add(elementName);
280
281                                        if(segmentId2nextRepIdx.containsKey(elementName)) 
282                                                curPath.add(segmentId2nextRepIdx.get(elementName));
283                                        else
284                                                curPath.add(new Integer(0));
285
286                                        segmentId2nextRepIdx.put(elementName, ((Integer)curPath.get(curPath.size()-1)).intValue() + 1);
287                                }
288                                ok = true;
289                        }
290                        else if((msgID.length() > 0) && (curPath.size() > 0)) {
291                                // we're entering a field or a component or a subcomponent.
292                                if(curPath.size() == 2) { // we're entering a field element
293                                        // all fields should start with segment-ID + '.' 
294                                        if(elementName.startsWith("" + curPath.get(0) + '.')) {
295                                                try {
296                                                        int fieldIdxFromElementName 
297                                                                = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
298
299                                                        curPath.add(new Integer(fieldIdxFromElementName));
300
301                                                        // now add the repetition idx to curPath: 
302                                                        if((lastDumpedPath.size() >= 4) 
303                                                                && (((Integer)lastDumpedPath.get(2)).intValue() 
304                                                                        == fieldIdxFromElementName))
305                                                        {
306                                                                // lastDumpedPath has a fieldIdx and a fieldRepIdx.
307                                                                curPath.add(new Integer(((Integer)lastDumpedPath.get(3)).intValue() + 1));
308                                                        }
309                                                        else
310                                                                curPath.add(new Integer(0));
311
312                                                        ok = true;
313                                                } catch(NumberFormatException e) {}
314                                        } // else => this isn't a field -- must be useless.
315                                }
316                                else if((curPath.size() == 4) || (curPath.size() == 5)) {
317                                        // we're entering a component or subcomponent element
318                                        try {
319                                                int idxFromElementName 
320                                                        = Integer.parseInt(elementName.substring(elementName.indexOf('.') + 1));
321                                                curPath.add(new Integer(idxFromElementName));
322                                                ok = true;
323                                        } catch(NumberFormatException e) {}
324                                }
325                        }
326                        return ok;
327                }
328
329                public void endElement(String uri, String localName, String qName) 
330                        throws SAXException 
331                {
332                        //System.err.println("endElement: " + qName);
333                        boolean ok = false;
334                        if(m_startedDocument) {
335                                if(m_depthWithinUselessElement >= 0) {
336                                        --m_depthWithinUselessElement;
337                                        ok = true;
338                                }
339                                else {
340                                        if((m_msgID.length() > 0) && (m_curPath.size() == 0)) {
341                                                // we're exiting either a message element or a 
342                                                // segment group element.
343                                                if((""+qName).compareTo(""+m_msgID) == 0)
344                                                        m_msgID.delete(0, m_msgID.length()); // => exiting message element
345                                                // else => segment group element -- do nothing.
346
347                                                ok = true;
348                                        }
349                                        else if((m_msgID.length() > 0) && (m_curPath.size() > 0)) {
350                                                tryToDumpDataToProps();
351
352                                                if(m_curPath.size() == 2) {
353                                                        // exiting a segment element
354                                                        m_curPath.setSize(0);
355                                                        ok = true;
356                                                }
357                                                else if(m_curPath.size() == 4) {
358                                                        // exiting a field element 
359                                                        m_curPath.setSize(2);
360                                                        ok = true;
361                                                }
362                                                else if((m_curPath.size() == 5) || (m_curPath.size() == 6)) {
363                                                        // exiting a component or a subcomponent
364                                                        m_curPath.setSize(m_curPath.size() - 1);
365                                                        ok = true;
366                                                }
367                                        }
368
369                                        if(m_depthWithinUsefulElement >= 0) 
370                                                --m_depthWithinUsefulElement;
371                                }
372                        }
373
374                        if(!ok) {
375                                clear();
376                                throw new StopParsingException();
377                        }
378                }
379
380                /** try to dump whatever we've got in m_chars to m_props, 
381                with a key of m_curPath.toString(). 
382                */
383                protected void tryToDumpDataToProps()
384                {
385                        if((m_curPath.size() >= 2) && (m_depthWithinUselessElement == -1)) {
386                                /* m_curPath.toString() will be the property key whose value will be
387                                m_chars.
388
389                                This is (part of) what m_lastDumpedPath is for: With, for example "<ZYX.9>
390                                <PT.1>P</PT.1> </ZYX.9>" we might have had a m_curPath containing something
391                                like [ZYX, 0, 9, 0, 0] when we exited the PT.1 element.  (note: internal
392                                DatumPath elements are 0-indexed, string representations of DatumPaths and
393                                the XML text is 1-indexed.)  So in m_props the key for "P" would have been
394                                "ZYX[0]-9[0]-1-1".  (the last "-1" is a default that got added by
395                                toString()).
396                                
397                                Then we would have exited the PT.3 element, changed m_curPath to [ZYX, 0,
398                                9, 0], picked up the whitespace between </PT.3> and </ZYX.9>, and when
399                                exiting the ZYX.9 element, we might have written that whitespace to m_props
400                                with a key of the toString() of [ZYX, 0, 9, 0]; that is, "ZYX[0]-9[0]-1-1":
401                                the same as the key for the "P" ... clobbering "P" in m_props with
402                                whitespace.
403
404                                But since we know that HL7 fields / components / etc are always in order
405                                (numerically), we can count on m_lastDumpedPath and use
406                                DatumPath.numbersLessThan to avoid the clobbering.
407                                */
408                                if((m_lastDumpedPath.get(0).equals(m_curPath.get(0))) 
409                                                ? (m_lastDumpedPath.numbersLessThan(m_curPath)) 
410                                                : true)
411                                {
412                                        if(m_depthWithinUsefulElement >= 0) {
413                                                m_props.setProperty(m_curPath.toString(), m_chars.toString());
414                                                m_lastDumpedPath.copy(m_curPath);
415                                                m_chars.delete(0, m_chars.length());
416                                        }
417                                }
418                        }
419                }
420
421                public void characters(char[] chars, int start, int length)
422                {
423                        // note that a contiguous run of characters in the document 
424                        // might get reported to us in several chunks. 
425                        // (In the order that the text appears in the document, 
426                        // non-overlapping and with no gaps between chunks.) 
427                        // An entity like &amp; will reach us as an actual & character.
428                        
429                        if((m_msgID.length() > 0) && (m_curPath.size() >= 4)) {
430                                m_chars.append(chars, start, length);
431                        }
432                }
433
434                public void ignoreableWhitespace(char []chars, int start, int length)
435                {
436                        // it's unclear which whitespace is considered ignorable for us.  
437                        // what the heck, add it to m_chars. 
438                        characters(chars, start, length);
439                }
440
441                public void error(SAXParseException e)
442                {
443                        // TODO: remove.
444                        System.err.println("Error in " + getClass() + ": " + e);
445                }
446
447                public void fatalError(SAXParseException e) throws SAXException 
448                {
449                        throw e;
450                }
451        }
452
453        /** parse message according to our HL7 XML handler, and dump the data found
454        to props.  
455        
456        returns true if we parsed ok, which means well-formed XML, and
457        that's about it.  We just barely check against HL7 structure, and ignore any
458        elements / text that is unexpected (that is, impossible in any HL7 message:
459        independant of any message / segment definitions).
460
461        "message" should be an XML document with one top-level element -- that being
462        the message.  (<ACK> or whatever).  We're only expecting one message to be in
463        "message".
464
465        props can be null if you don't want the data (we still parse).  The message
466        data found in message (that passes msgMask) will be added to props as key /
467        value pairs with the key a toString() of the appropriate DatumPath for the
468        location where the data is found (i.e. in the ZYX[a]-b[c]-d-e style), and
469        the value the corresponding text.  So, after calling parseMessage
470        successfully, if you wanted to retrieve the message data from props you
471        might call something like 
472        props.getProperty((new DatumPath()).add("MSH").add(1).toString())
473        and that would return a String with "|", probably.
474
475        Note that this package facilitates the extraction of message data in a way
476        independent of message version (i.e. components and whatever getting added):
477
478        With a message of "<FOO><ZYX><ZYX.42>fieldy-field-field</ZYX.42></ZYX></FOO>",
479        "ZYX[0]-1[0]-1-1" will be the key that ends up in props (see notes at
480        DatumPath.toString())
481
482        So if you, coding for a future version of the FOO message but
483        recieving old-version message data, tried
484        props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).add(0).add(1).toString()) 
485        with the message above (that is, trying to extract a repetition and
486        component that aren't there), you would get "ZYX[0]-42[0]-1-1" mapping to 
487        "fieldy-field-field" in the resulting props.  
488
489        If the message was
490        "<FOO><ZYX><ZYX.42><ARG.1>component data</ARG.1></ZYX.42></ZYX></FOO>"
491        and you, coding for an old version of this FOO message but recieving
492        new-version FOO message data, tried 
493        props.getProperty((new DatumPath()).add("ZYX").add(0).add(42).toString())
494        you would get "ZYX[0]-42[0]-1-1" mapping to "component data" in the resulting 
495        props.
496
497        msgMask lets you specify which parts of the message you want dumped to props.
498        Passing in null gets you everything.  Otherwise, msgMask's elements should
499        all be DatumPaths (! => ClassCastException), and a particular part of the
500        message will be dumped to props only if it's location, as represented by a
501        DatumPath, startsWith (as in DatumPath.startsWith()) at least one element of
502        msgMask.  So if one element of msgMask was a (new DatumPath()).add(new
503        String("ZYX")), then everything in all ZYX segment would get dumped to props.
504        A (new DatumPath()).add(new String("ZYX")).add(1) would get only the first
505        repetitions of same (if there is one) dumped to props.  etc. etc.  Note that
506        a DatumPath of size() == 0 in msgMask will get you everything, no matter what
507        the other elements of msgMask are, because all DatumPaths startsWith the
508        zero-length DatumPath.
509
510        Segment group elements (eg. ADT_A01.PROCEDURE) are handled fine, but they
511        aren't addressed in msgMask or in the output in props -- basically any
512        element tags at the level immediately inside the message element, and having
513        a name that starts with the message element name + '.', is ignored (meaning
514        it's contents are dealt with the same as if the start and end tags' just 
515        wasn't there.)
516        */
517        public static boolean parseMessage(Properties props, String message, 
518                        Collection<DatumPath> msgMask) throws HL7Exception
519        {
520                boolean ret = false;
521                try {
522                        SAXParserFactory factory = SAXParserFactory.newInstance();
523                        SAXParser parser = factory.newSAXParser();
524
525                        InputSource inSrc = new InputSource(new java.io.StringReader(message));
526
527                        HL7MessageHandler handler = new HL7MessageHandler();
528                        handler.m_props = (props != null 
529                                ? props : new Properties()); // it's expecting a props.
530
531                        if(msgMask != null)
532                                handler.m_msgMask = msgMask;
533                        else {
534                                handler.m_msgMask = new ArrayList<DatumPath>();
535                                handler.m_msgMask.add(new DatumPath());
536                        }
537
538                        parser.parse(inSrc, handler);
539                        ret = true;
540        } catch (ParserConfigurationException e) {
541            throw new HL7Exception(e);
542        } catch (IOException e) {
543            throw new HL7Exception(e);
544        } catch (StopParsingException e) {
545            throw new HL7Exception(e);
546        } catch (SAXException e) {
547            throw new HL7Exception(e);
548        }
549
550                return ret;
551        }
552
553        public static void main(String args[]) 
554        {
555                if(args.length >= 1) {
556                        Properties props = new Properties();
557                        List<DatumPath> msgMask = new ArrayList<DatumPath>();
558                        msgMask.add(new DatumPath().add("MSH").add(0).add(9));
559                        //msgMask.add(new DatumPath());
560                        boolean parseret;
561            try {
562                parseret = XML.parseMessage(props, args[0], msgMask);
563                System.err.println("parseMessage returned " + parseret);
564            } catch (HL7Exception e) {
565                e.printStackTrace();
566            }
567                        props.list(System.err);
568                }
569        }
570}
571