001/**
002 The contents of this file are subject to the Mozilla Public License Version 1.1
003 (the "License"); you may not use this file except in compliance with the License.
004 You may obtain a copy of the License at http://www.mozilla.org/MPL/
005 Software distributed under the License is distributed on an "AS IS" basis,
006 WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the
007 specific language governing rights and limitations under the License.
008
009 The Initial Developer of the Original Code is University Health Network. Copyright (C)
010 2001.  All Rights Reserved.
011
012 Contributor(s): Jens Kristian Villadsen from Cetrea A/S
013
014 Alternatively, the contents of this file may be used under the terms of the
015 GNU General Public License (the "GPL"), in which case the provisions of the GPL are
016 applicable instead of those above.  If you wish to allow use of your version of this
017 file only under the terms of the GPL and not to allow others to use your version
018 of this file under the MPL, indicate your decision by deleting  the provisions above
019 and replace  them with the notice and other provisions required by the GPL License.
020 If you do not delete the provisions above, a recipient may use your version of
021 this file under either the MPL or the GPL.
022
023 */
024
025
026package ca.uhn.hl7v2.llp;
027
028import java.io.UnsupportedEncodingException;
029import java.nio.charset.Charset;
030import java.util.Arrays;
031
032import ca.uhn.hl7v2.HL7Exception;
033import ca.uhn.hl7v2.parser.EncodingNotSupportedException;
034import ca.uhn.hl7v2.preparser.PreParser;
035import org.slf4j.Logger;
036import org.slf4j.LoggerFactory;
037
038/**
039 * Charset utility class
040 *
041 * @author Jens Kristian Villadsen from Cetrea A/S
042 * @author Christian Ohr
043 */
044public class CharSetUtil {
045
046    private static final Logger LOG = LoggerFactory.getLogger(CharSetUtil.class);
047
048    static Charset checkCharset(String message, Charset defaultCharset) {
049        Charset charset = defaultCharset;
050        try {
051            String[] fields = PreParser.getFields(message, "MSH-18(0)");
052            String hl7CharsetName = stripNonLowAscii(fields[0]);
053            if (hl7CharsetName != null && hl7CharsetName.length() > 0)
054                charset = HL7Charsets.getCharsetForHL7Encoding(hl7CharsetName);
055            LOG.trace("Detected MSH-18 value \"{}\" so using charset {}", hl7CharsetName, charset.displayName());
056        } catch (EncodingNotSupportedException e) {
057            LOG.warn("Invalid or unsupported charset in MSH-18. Defaulting to {}", charset.displayName());
058        } catch (HL7Exception e) {
059            LOG.warn("Failed to parse MSH segment. Defaulting to {}", charset.displayName(), e);
060        }
061        return charset;
062    }
063
064    static Charset checkCharset(byte[] message, Charset defaultCharset) {
065        String guessMessage = BOM.skipBOM(message);
066        return checkCharset(guessMessage, defaultCharset);
067    }
068
069    private static String stripNonLowAscii(String theString) {
070        if (theString == null) return "";
071        StringBuilder b = new StringBuilder();
072
073        for (int i = 0; i < theString.length(); i++) {
074            char next = theString.charAt(i);
075            if (next > 0 && next < 127) {
076                b.append(next);
077            }
078        }
079
080        return b.toString();
081    }
082
083    public static byte[] withoutBOM(byte[] bytes) {
084        BOM bom = BOM.getBOM(bytes);
085        byte[] withoutBOM = new byte[bytes.length - bom.bytes.length];
086        System.arraycopy(bytes, bom.bytes.length, withoutBOM, 0, bytes.length - bom.bytes.length);
087        return withoutBOM;
088    }
089
090    private enum BOM {
091
092        UTF_8(new byte[]{
093                (byte) 0xEF,
094                (byte) 0xBB,
095                (byte) 0xBF}, "UTF-8"),
096        UTF_16_LE(new byte[]{
097                (byte) 0xFF,
098                (byte) 0xFE}, "UTF-16LE"),
099        UTF_16_BE(new byte[]{
100                (byte) 0xFE,
101                (byte) 0xFF}, "UTF-16BE"),
102        UTF_32_LE(new byte[]{
103                (byte) 0xFF,
104                (byte) 0xFE,
105                (byte) 0x00,
106                (byte) 0x00}, "UTF-32LE"),
107        UTF_32_BE(new byte[]{
108                (byte) 0x00,
109                (byte) 0x00,
110                (byte) 0xFE,
111                (byte) 0xFF}, "UTF-32BE"),
112        NONE(new byte[]{},    "US-ASCII");
113
114        private byte[] bytes;
115        private Charset charset;
116
117        BOM(byte[] bytes, String charset) {
118            this.bytes = bytes;
119            this.charset = Charset.forName(charset);
120        }
121
122        public static BOM getBOM(byte[] bytes) {
123            for (BOM bom : BOM.values()) {
124                byte[] bytesToCompare = new byte[bom.bytes.length];
125                System.arraycopy(bytes, 0, bytesToCompare, 0, bom.bytes.length);
126                if (Arrays.equals(bom.bytes, bytesToCompare)) return bom;
127            }
128            return BOM.NONE;
129        }
130
131        public static String skipBOM(byte[] bytes) {
132            try {
133                BOM bom = getBOM(bytes);
134                return new String(bytes, bom.bytes.length, bytes.length - bom.bytes.length, bom.charset.toString());
135            } catch (UnsupportedEncodingException e) {
136                // does not happen
137                return null;
138            }
139        }
140
141
142
143
144    }
145}