001////////////////////////////////////////////////////////////////////////////////
002// checkstyle: Checks Java source code for adherence to a set of rules.
003// Copyright (C) 2001-2018 the original author or authors.
004//
005// This library is free software; you can redistribute it and/or
006// modify it under the terms of the GNU Lesser General Public
007// License as published by the Free Software Foundation; either
008// version 2.1 of the License, or (at your option) any later version.
009//
010// This library is distributed in the hope that it will be useful,
011// but WITHOUT ANY WARRANTY; without even the implied warranty of
012// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013// Lesser General Public License for more details.
014//
015// You should have received a copy of the GNU Lesser General Public
016// License along with this library; if not, write to the Free Software
017// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018////////////////////////////////////////////////////////////////////////////////
019
020package com.puppycrawl.tools.checkstyle.checks;
021
022import java.util.List;
023import java.util.Map;
024import java.util.regex.Matcher;
025import java.util.regex.Pattern;
026
027import com.puppycrawl.tools.checkstyle.FileStatefulCheck;
028import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
029import com.puppycrawl.tools.checkstyle.api.DetailAST;
030import com.puppycrawl.tools.checkstyle.api.TextBlock;
031import com.puppycrawl.tools.checkstyle.api.TokenTypes;
032import com.puppycrawl.tools.checkstyle.utils.CommonUtils;
033
034/**
035 * <p>
036 * Restrict using <a href =
037 * "https://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
038 * Unicode escapes</a> (such as <code>&#92;u221e</code>).
039 * It is possible to allow using escapes for
040 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
041 * non-printable(control) characters</a>.
042 * Also, this check can be configured to allow using escapes
043 * if trail comment is present. By the option it is possible to
044 * allow using escapes if literal contains only them. By the option it
045 * is possible to allow using escapes for space literals.
046 * </p>
047 * <p>
048 * Examples of using Unicode:</p>
049 * <pre>
050 * String unitAbbrev = "μs";      // Best: perfectly clear even without a comment.
051 * String unitAbbrev = "&#92;u03bcs"; // Poor: the reader has no idea what this is.
052 * </pre>
053 * <p>
054 * An example of how to configure the check is:
055 * </p>
056 * <pre>
057 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
058 * </pre>
059 * <p>
060 * An example of non-printable(control) characters.
061 * </p>
062 * <pre>
063 * return '&#92;ufeff' + content; // byte order mark
064 * </pre>
065 * <p>
066 * An example of how to configure the check to allow using escapes
067 * for non-printable(control) characters:
068 * </p>
069 * <pre>
070 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
071 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
072 * &lt;/module&gt;
073 * </pre>
074 * <p>
075 * Example of using escapes with trail comment:
076 * </p>
077 * <pre>
078 * String unitAbbrev = "&#92;u03bcs"; // Greek letter mu, "s"
079 * </pre>
080 * <p>An example of how to configure the check to allow using escapes
081 * if trail comment is present:
082 * </p>
083 * <pre>
084 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
085 *     &lt;property name="allowByTailComment" value="true"/&gt;
086 * &lt;/module&gt;
087 * </pre>
088 * <p>Example of using escapes if literal contains only them:
089 * </p>
090 * <pre>
091 * String unitAbbrev = "&#92;u03bc&#92;u03bc&#92;u03bc";
092 * </pre>
093 * <p>An example of how to configure the check to allow escapes
094 * if literal contains only them:
095 * </p>
096 * <pre>
097 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
098 *    &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
099 * &lt;/module&gt;
100 * </pre>
101 * <p>An example of how to configure the check to allow non-printable escapes:
102 * </p>
103 * <pre>
104 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
105 *    &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
106 * &lt;/module&gt;
107 * </pre>
108 *
109 * @author maxvetrenko
110 * @noinspection HtmlTagCanBeJavadocTag
111 */
112@FileStatefulCheck
113public class AvoidEscapedUnicodeCharactersCheck
114    extends AbstractCheck {
115
116    /**
117     * A key is pointing to the warning message text in "messages.properties"
118     * file.
119     */
120    public static final String MSG_KEY = "forbid.escaped.unicode.char";
121
122    /** Regular expression for Unicode chars. */
123    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");
124
125    /**
126     * Regular expression Unicode control characters.
127     *
128     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
129     *     Appendix:Control characters</a>
130     */
131    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\[uU]"
132            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00[aA][dD]|034[fF]|070[fF]"
133            + "|180[eE]|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
134            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");
135
136    /** Regular expression for all escaped chars. */
137    private static final Pattern ALL_ESCAPED_CHARS =
138            Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
139                    + "|\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");
140
141    /** Regular expression for escaped backslash. */
142    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");
143
144    /** Regular expression for non-printable unicode chars. */
145    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
146            + "|\\\\u2029|\\\\u205[fF]|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200[aA]"
147            + "|\\\\u007[fF]|\\\\u009[fF]|\\\\u[fF]{4}|\\\\u00[aA][dD]"
148            + "|\\\\u0600|\\\\u061[cC]|\\\\u06[dD]{2}|\\\\u070[fF]|\\\\u180[eE]"
149            + "|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
150            + "|\\\\u206[aA]|\\\\u[dD]800|\\\\u[fF][eE][fF]{2}|\\\\u[fF]{3}9"
151            + "|\\\\u[fF]{3}[aA]|\\\\u0020|\\\\u00[aA]0|\\\\u0604"
152            + "|\\\\u200[fF]"
153            + "|\\\\u202[fF]|\\\\u2064|\\\\u206[fF]"
154            + "|\\\\u[fF]8[fF]{2}|\\\\u[fF]{3}[bB]"
155            + "|\\\\u05[dD]0|\\\\u05[fF]3|\\\\u0750|\\\\u0[eE]00|\\\\u1[eE]00"
156            + "|\\\\u2100|\\\\u[fF][bB]50|\\\\u[fF][eE]70|\\\\u[fF]{2}61|\\\\u04[fF]9"
157            + "|\\\\u05[bB][eE]|\\\\u05[eE][aA]|\\\\u05[fF]4|\\\\u06[fF]{2}"
158            + "|\\\\u077[fF]|\\\\u0[eE]7[fF]|\\\\u20[aA][fF]|\\\\u213[aA]|\\\\u0000"
159            + "|\\\\u[fF][dD][fF]{2}|\\\\u[fF]{2}[dD][cC]"
160            + "|\\\\u2002|\\\\u0085|\\\\u2005|\\\\u000[bB]"
161            + "|\\\\u2008|\\\\u2003|\\\\u0009|\\\\u2006"
162            + "|\\\\u2001|\\\\u000[cC]|\\\\u2009|\\\\u2004"
163            + "|\\\\u2025"
164            + "|\\\\u[fF]{2}0[eE]");
165
166    /** Cpp style comments. */
167    private Map<Integer, TextBlock> singlelineComments;
168    /** C style comments. */
169    private Map<Integer, List<TextBlock>> blockComments;
170
171    /** Allow use escapes for non-printable(control) characters.  */
172    private boolean allowEscapesForControlCharacters;
173
174    /** Allow use escapes if trail comment is present. */
175    private boolean allowByTailComment;
176
177    /** Allow if all characters in literal are escaped. */
178    private boolean allowIfAllCharactersEscaped;
179
180    /** Allow escapes for space literals. */
181    private boolean allowNonPrintableEscapes;
182
183    /**
184     * Set allowIfAllCharactersEscaped.
185     * @param allow user's value.
186     */
187    public final void setAllowEscapesForControlCharacters(boolean allow) {
188        allowEscapesForControlCharacters = allow;
189    }
190
191    /**
192     * Set allowByTailComment.
193     * @param allow user's value.
194     */
195    public final void setAllowByTailComment(boolean allow) {
196        allowByTailComment = allow;
197    }
198
199    /**
200     * Set allowIfAllCharactersEscaped.
201     * @param allow user's value.
202     */
203    public final void setAllowIfAllCharactersEscaped(boolean allow) {
204        allowIfAllCharactersEscaped = allow;
205    }
206
207    /**
208     * Set allowSpaceEscapes.
209     * @param allow user's value.
210     */
211    public final void setAllowNonPrintableEscapes(boolean allow) {
212        allowNonPrintableEscapes = allow;
213    }
214
215    @Override
216    public int[] getDefaultTokens() {
217        return getRequiredTokens();
218    }
219
220    @Override
221    public int[] getAcceptableTokens() {
222        return getRequiredTokens();
223    }
224
225    @Override
226    public int[] getRequiredTokens() {
227        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
228    }
229
230    @Override
231    public void beginTree(DetailAST rootAST) {
232        singlelineComments = getFileContents().getSingleLineComments();
233        blockComments = getFileContents().getBlockComments();
234    }
235
236    @Override
237    public void visitToken(DetailAST ast) {
238        final String literal = ast.getText();
239
240        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
241                || isAllCharactersEscaped(literal)
242                || allowEscapesForControlCharacters
243                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
244                || allowNonPrintableEscapes
245                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
246            log(ast.getLineNo(), MSG_KEY);
247        }
248    }
249
250    /**
251     * Checks if literal has Unicode chars.
252     * @param literal String literal.
253     * @return true if literal has Unicode chars.
254     */
255    private static boolean hasUnicodeChar(String literal) {
256        final String literalWithoutEscapedBackslashes =
257                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
258        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
259    }
260
261    /**
262     * Check if String literal contains Unicode control chars.
263     * @param literal String literal.
264     * @param pattern RegExp for valid characters.
265     * @return true, if String literal contains Unicode control chars.
266     */
267    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
268        final int unicodeMatchesCounter =
269                countMatches(UNICODE_REGEXP, literal);
270        final int unicodeValidMatchesCounter =
271                countMatches(pattern, literal);
272        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
273    }
274
275    /**
276     * Check if trail comment is present after ast token.
277     * @param ast current token.
278     * @return true if trail comment is present after ast token.
279     */
280    private boolean hasTrailComment(DetailAST ast) {
281        boolean result = false;
282        final int lineNo = ast.getLineNo();
283        if (singlelineComments.containsKey(lineNo)) {
284            result = true;
285        }
286        else {
287            final List<TextBlock> commentList = blockComments.get(lineNo);
288            if (commentList != null) {
289                final TextBlock comment = commentList.get(commentList.size() - 1);
290                final String line = getLines()[lineNo - 1];
291                result = isTrailingBlockComment(comment, line);
292            }
293        }
294        return result;
295    }
296
297    /**
298     * Whether the C style comment is trailing.
299     * @param comment the comment to check.
300     * @param line the line where the comment starts.
301     * @return true if the comment is trailing.
302     */
303    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
304        return comment.getText().length != 1
305            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
306    }
307
308    /**
309     * Count regexp matches into String literal.
310     * @param pattern pattern.
311     * @param target String literal.
312     * @return count of regexp matches.
313     */
314    private static int countMatches(Pattern pattern, String target) {
315        int matcherCounter = 0;
316        final Matcher matcher = pattern.matcher(target);
317        while (matcher.find()) {
318            matcherCounter++;
319        }
320        return matcherCounter;
321    }
322
323    /**
324     * Checks if all characters in String literal is escaped.
325     * @param literal current literal.
326     * @return true if all characters in String literal is escaped.
327     */
328    private boolean isAllCharactersEscaped(String literal) {
329        return allowIfAllCharactersEscaped
330                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
331                        literal.length() - 1)).find();
332    }
333
334}