001    /**  
002     * File:    UUCoder.java   
003     * Author:  Bryan Parno 
004     * Email:   parno@fas.harvard.edu
005     * Created: 5/16/03
006     * Purpose: Converts binary data into ASCII representation
007     * Updated 12/9/03 to convert short byte arrays
008     */
009    package edu.harvard.deas.hyperenc.util;
010    
011    
012    /**
013     * Encodes and decodes binary data into ASCII using a similar, but not
014     * identical, format to UUEncoding.
015     * <p>
016     * Bytes are translated into ASCII characters in the same way as UUEncoding:
017     * groups of three 8-bit bytes are realigned into sets of four 6-bit groups.
018     * Each 6-bit group is then converted to an ASCII character by adding 32, with
019     * the exception of 00: a zero is represented by the backtick (`, ASCII 96).
020     * Thus the valid characters, spanning from ASCII 33 to ASCII 96, are:<br>
021     * <blockquote>
022     * 
023     * <pre>
024     * !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`
025     * </pre>
026     * </blockquote>
027     * 
028     * If the message to be encoded is not a multiple of 3 bytes, it is padded with
029     * zeros so that it is.
030     * <p>
031     * Messages are not delineated as in standard UUEncoding. The first line of a
032     * message is a decimal number indicating how many bytes are encoded by the
033     * message. This is followed by the encoded data on as many lines as needed.
034     * Unlike standard UUEncoding, the length of each line is not indicated in any
035     * way. The only requirement is that each line contain a multiple of 4
036     * characters, not including the trailing newline character(s). Finally, the
037     * message ends with a line consisting of only the token "<code>TERMEND</code>".
038     */
039    // XXX it would be better if this did standard UUEncoding...
040    public class UUCoder
041    {
042      /**
043       * Number of characters to place on each line when encoding. Must be a
044       * multiple of 4.
045       */
046            private final static int CHARS_PER_LINE = 48;
047            
048            /**
049             * Token that indicates the end of a message.
050             */
051            public final static String TERMINATOR = "TERMEND";
052    
053      /**
054       * @return a String containing all the legal characters that may appear in an
055       *         encoding (excluding newline characters)
056       */
057            public static String getLegalChars() {
058              String s = "";
059              for (int i = 33; i < 97; i++) {
060                s += (char)i;
061              }
062              return s;
063            }
064    
065      /**
066       * Encodes the given data. The format of the encoding is described in the
067       * class description.
068       * 
069       * @param data
070       *        the data to be encoded
071       * @return a string containing the encoding of <code>data</code>. The string
072       *         contains only ASCII characters from 33 to 96.
073       */
074            public static String encode(byte[] data) {
075                    int numChars = 0;
076                    String result = new String("");
077                    
078                    // Place the unencoded message length (in bytes) on the first line
079                    result = result + data.length + "\n";
080    
081                    // Encode in units of 3 bytes = 24 bits = 4 sets of 6 bits 
082                    for (int i = 0; i < (int) Math.ceil(data.length / 3.0); i++) {
083                            byte a=0,b=0,c=0;
084    
085                            a = data[i * 3];
086                      // Pad with zero bytes so the input length is a multiple of 3
087                            if (i*3 + 1 >= data.length) {
088                                    b = (byte) 0x00;
089                                    c = (byte) 0x00;
090                            } else if (i*3 + 2 >= data.length) {
091                                    b = data[i * 3 + 1];
092                                    c = (byte) 0x00;
093                            } else {
094                                    b = data[i * 3 + 1];
095                                    c = data[i * 3 + 2];
096                            }
097    
098                            char c1, c2, c3, c4;
099                            // Take top 6 bits of 'a'
100                            c1 = (char) (unsignedRightShift(a, 2) + 32);
101                            // Take bottom 2 bits of 'a' and upper 4 bits of 'b'
102                            c2 = (char) ((unsignedRightShift((byte)(a << 6), 2)
103                                    | unsignedRightShift(b, 4)) + 32);
104                            // Take lower 4 bits of b and upper 2 bits of 'c'
105                            c3 = (char) ((unsignedRightShift((byte)(b << 4), 2)
106                                    | unsignedRightShift(c, 6)) + 32);
107                            // Take lower 6 bits of 'c'
108                            c4 = (char) (unsignedRightShift((byte)(c << 2), 2)
109                                                 + 32);
110                            
111                            // Replace spaces (ASCII 32) with backticks (`, ASCII 96).
112                            if (c1 == 32)
113                              c1 = 96;
114                            if (c2 == 32)
115            c2 = 96;
116                            if (c3 == 32)
117            c3 = 96;
118                            if (c4 == 32)
119            c4 = 96;
120    
121                            result = result + c1 + c2 + c3 + c4;
122                            numChars += 4;
123    
124                            if (numChars >= CHARS_PER_LINE) {
125                                    numChars = 0;
126                                    result = addLineSuffix(result);
127                            }
128                    }
129    
130                    result = addTerminator(result);
131            
132                    return result;
133            }
134    
135            /** Encode a very short block of bytes - Expansion factor is 4 characters for every 3 bytes 
136                    @param data Byte data to be encoded 
137            public static String shortEncode(byte[] data) {
138            String result = new String("");
139            
140            // Encode in units of 3 bytes = 24 bits = 4 sets of 6 bits 
141            for (int i = 0; i < (int) Math.ceil((double) (data.length / 3.0)); i++) {
142                byte a=0,b=0,c=0; 
143                                
144                a = data[i * 3];
145                if (i*3 + 1 >= data.length) {
146                    b = (byte) 0xff;
147                    c = (byte) 0xff;
148                } else if (i*3 + 2 >= data.length) {
149                    b = data[i * 3 + 1];
150                    c = (byte) 0xff;
151                } else {
152                    b = data[i * 3 + 1];
153                    c = data[i * 3 + 2];
154                }
155            
156                char c1, c2, c3, c4;
157    
158                // Take top 6 bits of 'a'
159                c1 = (char) (unsignedRightShift(a, 2) + UUCoder.ASCII_SPACE);
160    
161                // Take bottom 2 bits of 'a' and upper 4 bits of 'b'
162                c2 = (char) ((unsignedRightShift((byte)(a << 6), 2)
163                    | unsignedRightShift(b, 4)) + UUCoder.ASCII_SPACE);
164    
165                // Take lower 4 bits of b and upper 2 bits of 'c'
166                c3 = (char) ((unsignedRightShift((byte)(b << 4), 2)
167                    | unsignedRightShift(c, 6)) + UUCoder.ASCII_SPACE);
168    
169                // Take lower 6 bits of 'c'
170                c4 = (char) (unsignedRightShift((byte)(c << 2), 2)
171                             + UUCoder.ASCII_SPACE);
172                
173                result = result + c1 + c2 + c3 + c4;
174            }
175    
176                    return result;
177            }       
178    
179            /** Decodes a string that resulted from encoding a short array of bytes
180                    @param data The encoded string
181                    @param length The expected length of the byte array.  Must be the exact
182                                              same length as the array encoded by shortEncode 
183            public byte[] shortDecode(String data, int length) {
184                    String line = data; 
185    
186                    int numBytes = length; 
187    
188                    byte[] bytes = new byte[numBytes];
189                    int count = 0; 
190    
191                    if (line.length() % 4 != 0) {
192                            System.out.println("Encoding error!  Line is: " + line.length() +
193                                            " chars long.  The line is:" + line + "and the offending char is"
194                                            + line.charAt(line.length()-1) + " and in decimal, that's"
195                                            + (int)line.charAt(line.length()-1));
196                            System.exit(5);
197                    }   
198    
199                    for (int i=0; i < line.length() / 4; i++) {
200                            char c1, c2, c3, c4;
201    
202                            c1 = line.charAt(i*4);
203                            c2 = line.charAt(i*4 + 1);
204                            c3 = line.charAt(i*4 + 2);
205                            c4 = line.charAt(i*4 + 3);
206    
207                            byte a, b, c;
208    
209                            // Grab upper 6 bits from c1
210                            a =(byte)(((byte) (c1 - UUCoder.ASCII_SPACE)) << 2);
211                            // Grab lower 2 bits from c2
212                            a = (byte) (a | this.unsignedRightShift(
213                                                    (byte)(c2 - UUCoder.ASCII_SPACE), 4));
214    
215                            // Grab upper 4 bits from c2
216                            b = (byte) (((byte)(c2 - UUCoder.ASCII_SPACE)) << 4);
217                            // Grab lower 4 bits from c3
218                            b = (byte) (b | this.unsignedRightShift(
219                                                    (byte)(c3 - UUCoder.ASCII_SPACE), 2));
220    
221                            // Grab upper 2 bits from c3
222                            c = (byte) (((byte)(c3 - UUCoder.ASCII_SPACE)) << 6);
223                            // Grab lower 6 bits from c4
224                            c = (byte) (c | ((byte)(c4 - UUCoder.ASCII_SPACE)));
225    
226                            bytes[count++] = a;
227                            if (count >= numBytes) {break;}
228                            bytes[count++] = b;
229                            if (count >= numBytes) {break;}
230                            bytes[count++] = c;
231                            if (count >= numBytes) {break;}
232                    }
233    
234                    return bytes;
235        }*/
236                    
237            
238                    
239    
240            private static String addLineSuffix(String data) {
241                    return data + "\n";
242            }
243    
244            private static String addTerminator(String data) {
245                    return data + "\n" + UUCoder.TERMINATOR;
246            }
247    
248      /**
249       * Decodes the given string. The format of the encoding is described in the
250       * class description.
251       * 
252       * @param data
253       *        the data to be decoded
254       * @throws IllegalArgumentException
255       *         if <code>data</code> is not a valid encoding
256       * @return a byte array containing the data encoded by <code>data</code>
257       */
258            public static byte[] decode(String data) {
259                    // \r\f = carriage-return and form-feed respectively
260                    String[] lines = data.split("[\n\r\f]");
261                    int numBytes = Integer.parseInt(lines[0].trim());
262                    
263                    byte[] bytes = new byte[numBytes];
264                    int byteCount = 0;
265    
266                    // Start at 1 to skip the first line (processed above)
267                    for (int linecount = 1; linecount < lines.length; linecount++) {
268                            String line = lines[linecount];
269                            
270                            if (line.equals(UUCoder.TERMINATOR)) {
271                                    break;
272                            }
273    
274                            if (line.length() % 4 != 0) {
275                                    throw new IllegalArgumentException("Line " + linecount
276                + " has illegal length of " + line.length() + " characters.");
277                            }
278                            
279                            for (int i=0; i < line.length() / 4; i++) {
280                                    char c1, c2, c3, c4;
281                                    
282                                    c1 = line.charAt(i*4);
283                                    c2 = line.charAt(i*4 + 1);
284                                    c3 = line.charAt(i*4 + 2);
285                                    c4 = line.charAt(i*4 + 3);
286                                    
287            if ( c1 < 33 || c2 < 33 || c3 < 33 || c4 < 33 ||
288                 c1 > 96 || c2 > 96 || c3 > 96 || c4 > 96 ) {
289              throw new IllegalArgumentException("Illegal character on line "
290                  + linecount + " in group " + i + ": " + line.substring(i*4, i*4+4));
291            }
292            
293            // Replace backticks (`, ASCII 96) with spaces (ASCII 32)
294            if (c1 == 96)
295              c1 = 32;
296            if (c2 == 96)
297              c2 = 32;
298            if (c3 == 96)
299              c3 = 32;
300            if (c4 == 96)
301              c4 = 32;
302    
303                                    byte a, b, c;
304                                    
305                                    // Grab upper 6 bits from c1
306                                    a =(byte)(((byte) (c1 - 32)) << 2);
307                                    // Grab lower 2 bits from c2
308                                    a = (byte) (a | unsignedRightShift(
309                                                        (byte)(c2 - 32), 4));
310    
311                                    // Grab upper 4 bits from c2
312                                    b = (byte) (((byte)(c2 - 32)) << 4);
313                                    // Grab lower 4 bits from c3
314                                    b = (byte) (b | unsignedRightShift(
315                                                        (byte)(c3 - 32), 2));
316    
317                                    // Grab upper 2 bits from c3
318                                    c = (byte) (((byte)(c3 - 32)) << 6);
319                                    // Grab lower 6 bits from c4
320                                    c = (byte) (c | ((byte)(c4 - 32))); 
321                                    
322                                    // If we're about to overrun the bytes array, throw an error because
323            // there are too many characters here
324                  if (byteCount >= numBytes) {
325                    throw new IllegalArgumentException(
326                        "Encoded message is too long. (Message was declared to be "
327                            + numBytes + " bytes long.)");
328                  }
329                                    
330                  // Terminate early if we hit the end of the message, but not before
331            // checking that the remainder of the data is in fact zero
332                                    bytes[byteCount++] = a;
333                                    if (byteCount >= numBytes) {
334                                      if (b != 0 || c != 0) {
335                                        throw new IllegalArgumentException(
336                    "Reached end of message, but pad bytes were non-zero (byte 2 = "
337                        + b + ", byte 3 = " + c + ").");
338                                      }
339                                      break;
340                                    }
341                                    
342                                    bytes[byteCount++] = b;
343                                    if (byteCount >= numBytes) {
344                                      if (c != 0) {
345                throw new IllegalArgumentException(
346                    "Reached end of message, but pad byte was non-zero (byte 3 = "
347                        + c + ").");
348              }
349                                      break;
350                                    }
351                                    
352                                    bytes[byteCount++] = c;
353                                    if (byteCount >= numBytes) {
354                                      break;
355                                    }
356                            }
357                    }
358    
359                    return bytes;
360            }
361    
362      /**
363       * Performs an unsigned right shift on bytes. Necessary because Java's
364       * <code>>>></code> only operates on 32-bit values: specifically,
365       * <code>(byte)((byte)0xff >>> 4)</code> yields <code>0xff</code> = -1,
366       * because <code>0xff</code> is sign-extended when it is converted to an
367       * <code>int</code> for use with <code>>>></code>.
368       */
369            private static byte unsignedRightShift(byte b, int num) {
370                    if (num == 0) {
371                            return b;
372                    }
373    
374                    if ((b & ((byte) 1 << 7)) != 0) {     // High-order bit is 1
375                            // First do the right shift
376                            b = (byte) (b >>> num);
377                            
378                            byte temp = (byte) 128;
379                            // Use sign extension to fill in top bits with 1's 
380                            temp = (byte)(temp >> (num - 1)); 
381                            temp = (byte) ~temp;    // Invert to get 0's on top
382    
383                            b = (byte) (b & temp);      // Now clear out the top bits
384    
385                            return b;
386                    } else { // High-order bit is 0, so we can do a normal right shift
387                            return (byte) (b >>> num);
388                    }
389            }
390    }