001/*
002    Licensed to the Apache Software Foundation (ASF) under one
003    or more contributor license agreements.  See the NOTICE file
004    distributed with this work for additional information
005    regarding copyright ownership.  The ASF licenses this file
006    to you under the Apache License, Version 2.0 (the
007    "License"); you may not use this file except in compliance
008    with the License.  You may obtain a copy of the License at
009
010       http://www.apache.org/licenses/LICENSE-2.0
011
012    Unless required by applicable law or agreed to in writing,
013    software distributed under the License is distributed on an
014    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015    KIND, either express or implied.  See the License for the
016    specific language governing permissions and limitations
017    under the License.  
018 */
019package org.apache.wiki.filters;
020
021import net.sf.akismet.Akismet;
022import org.apache.commons.lang3.StringUtils;
023import org.apache.commons.lang3.time.StopWatch;
024import org.apache.logging.log4j.LogManager;
025import org.apache.logging.log4j.Logger;
026import org.apache.oro.text.regex.MalformedPatternException;
027import org.apache.oro.text.regex.MatchResult;
028import org.apache.oro.text.regex.Pattern;
029import org.apache.oro.text.regex.PatternCompiler;
030import org.apache.oro.text.regex.PatternMatcher;
031import org.apache.oro.text.regex.Perl5Compiler;
032import org.apache.oro.text.regex.Perl5Matcher;
033import org.apache.wiki.InternalWikiException;
034import org.apache.wiki.api.core.Attachment;
035import org.apache.wiki.api.core.Context;
036import org.apache.wiki.api.core.ContextEnum;
037import org.apache.wiki.api.core.Engine;
038import org.apache.wiki.api.core.Page;
039import org.apache.wiki.api.exceptions.ProviderException;
040import org.apache.wiki.api.exceptions.RedirectException;
041import org.apache.wiki.api.filters.BasePageFilter;
042import org.apache.wiki.api.providers.WikiProvider;
043import org.apache.wiki.attachment.AttachmentManager;
044import org.apache.wiki.auth.user.UserProfile;
045import org.apache.wiki.pages.PageManager;
046import org.apache.wiki.ui.EditorManager;
047import org.apache.wiki.util.FileUtil;
048import org.apache.wiki.util.HttpUtil;
049import org.apache.wiki.util.TextUtil;
050import org.suigeneris.jrcs.diff.Diff;
051import org.suigeneris.jrcs.diff.DifferentiationFailedException;
052import org.suigeneris.jrcs.diff.Revision;
053import org.suigeneris.jrcs.diff.delta.AddDelta;
054import org.suigeneris.jrcs.diff.delta.ChangeDelta;
055import org.suigeneris.jrcs.diff.delta.DeleteDelta;
056import org.suigeneris.jrcs.diff.delta.Delta;
057import org.suigeneris.jrcs.diff.myers.MyersDiff;
058
059import javax.servlet.http.HttpServletRequest;
060import javax.servlet.http.HttpServletResponse;
061import javax.servlet.jsp.PageContext;
062import java.io.BufferedReader;
063import java.io.IOException;
064import java.io.InputStream;
065import java.io.InputStreamReader;
066import java.io.StringReader;
067import java.io.StringWriter;
068import java.nio.charset.StandardCharsets;
069import java.util.ArrayList;
070import java.util.Arrays;
071import java.util.Collection;
072import java.util.Date;
073import java.util.Iterator;
074import java.util.List;
075import java.util.Properties;
076import java.util.Random;
077import java.util.StringTokenizer;
078import java.util.Vector;
079import java.util.concurrent.ThreadLocalRandom;
080
081
082/**
083 *  This is Herb, the JSPWiki spamfilter that can also do choke modifications.
084 *
085 *  Parameters:
086 *  <ul>
087 *    <li>wordlist - Page name where the spamword regexps are found.  Use [{SET spamwords='regexp list separated with spaces'}] on
088 *     that page.  Default is "SpamFilterWordList".
089 *    <li>IPlist - Page name where the IP regexps are found.  Use [{SET ips='regexp list separated with spaces'}] on
090 *     that page.  Default is "SpamFilterIPList".
091 *    <li>maxpagenamelength - Maximum page name length. Default is 100.
092 *    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
093 *        "SpamFilterWordList/blacklist.txt"</li>
094 *    <li>errorpage - The page to which the user is redirected.  Has a special variable $msg which states the reason. Default is "RejectedMessage".
095 *    <li>pagechangesinminute - How many page changes are allowed/minute.  Default is 5.</li>
096 *    <li>similarchanges - How many similar page changes are allowed before the host is banned.  Default is 2.  (since 2.4.72)</li>
097 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
099 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
100 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
101 *    <li>captcha - Sets the captcha technology to use.  Current allowed values are "none" and "asirra".</li>
102 *    <li>strategy - Sets the filtering strategy to use.  If set to "eager", will stop at the first probable
103 *        match, and won't consider any other tests.  This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
104 *        and calculates a score for the spam, which is then compared to a filter level value.
105 *  </ul>
106 *
107 *  <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates
108 *  with the editor system.</p>
109 *  
110 *  <p>Changes by admin users are ignored in any case.</p>
111 *
112 *  @since 2.1.112
113 */
114public class SpamFilter extends BasePageFilter {
115    
116    private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score";
117    private static final String REASON_REGEXP = "Regexp";
118    private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily";
119    private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently";
120    private static final String REASON_BOT_TRAP = "BotTrap";
121    private static final String REASON_AKISMET = "Akismet";
122    private static final String REASON_TOO_MANY_URLS = "TooManyUrls";
123    private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications";
124    private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications";
125    private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong";
126    private static final String REASON_UTF8_TRAP = "UTF8Trap";
127
128    private static final String LISTVAR = "spamwords";
129    private static final String LISTIPVAR = "ips";
130
131    private static final Random RANDOM = ThreadLocalRandom.current();
132
133    /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */
134    public static final String  PROP_WORDLIST              = "wordlist";
135
136    /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */
137    public static final String  PROP_IPLIST                = "IPlist";
138
139    /** The filter property name for specifying the maximum page name length.  Value is <tt>{@value}</tt>. */
140    public static final String  PROP_MAX_PAGENAME_LENGTH   = "maxpagenamelength";
141
142    /** The filter property name for the page to which you are directed if Herb rejects your edit.  Value is <tt>{@value}</tt>. */
143    public static final String  PROP_ERRORPAGE             = "errorpage";
144    
145    /** The filter property name for specifying how many changes is any given IP address
146     *  allowed to do per minute.  Value is <tt>{@value}</tt>.
147     */
148    public static final String  PROP_PAGECHANGES           = "pagechangesinminute";
149    
150    /** The filter property name for specifying how many similar changes are allowed before a host is banned.  Value is <tt>{@value}</tt>. */
151    public static final String  PROP_SIMILARCHANGES        = "similarchanges";
152    
153    /** The filter property name for specifying how long a host is banned.  Value is <tt>{@value}</tt>.*/
154    public static final String  PROP_BANTIME               = "bantime";
155    
156    /** The filter property name for the attachment containing the blacklist.  Value is <tt>{@value}</tt>.*/
157    public static final String  PROP_BLACKLIST             = "blacklist";
158    
159    /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */
160    public static final String  PROP_MAXURLS               = "maxurls";
161    
162    /** The filter property name for specifying the Akismet API-key.  Value is <tt>{@value}</tt>. */
163    public static final String  PROP_AKISMET_API_KEY       = "akismet-apikey";
164    
165    /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */
166    public static final String  PROP_IGNORE_AUTHENTICATED  = "ignoreauthenticated";
167
168    /** The filter property name for specifying groups allowed to bypass the spam filter. Value is <tt>{@value}</tt>. */
169    public static final String PROP_ALLOWED_GROUPS = "jspwiki.filters.spamfilter.allowedgroups";
170    
171    /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */
172    public static final String  PROP_CAPTCHA               = "captcha";
173    
174    /** The filter property name for specifying which filter strategy should be used.  Value is <tt>{@value}</tt>. */
175    public static final String  PROP_FILTERSTRATEGY        = "strategy";
176
177    /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */
178    public static final String  STRATEGY_EAGER             = "eager";
179    
180    /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */
181    public static final String  STRATEGY_SCORE             = "score";
182
183    private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)";
184
185    private String          m_forbiddenWordsPage = "SpamFilterWordList";
186    private String          m_forbiddenIPsPage   = "SpamFilterIPList";
187    private String          m_pageNameMaxLength  = "100";
188    private String          m_errorPage          = "RejectedMessage";
189    private String          m_blacklist          = "SpamFilterWordList/blacklist.txt";
190
191    private final PatternMatcher  m_matcher = new Perl5Matcher();
192    private final PatternCompiler m_compiler = new Perl5Compiler();
193
194    private Collection<Pattern> m_spamPatterns;
195    private Collection<Pattern> m_IPPatterns;
196
197    private Date m_lastRebuild = new Date( 0L );
198
199    private static final Logger C_SPAMLOG = LogManager.getLogger( "SpamLog" );
200    private static final Logger LOG = LogManager.getLogger( SpamFilter.class );
201
202    private final Vector<Host>    m_temporaryBanList = new Vector<>();
203
204    private int             m_banTime = 60; // minutes
205
206    private final Vector<Host>    m_lastModifications = new Vector<>();
207
208    /** How many times a single IP address can change a page per minute? */
209    private int             m_limitSinglePageChanges = 5;
210
211    /** How many times can you add the exact same string to a page? */
212    private int             m_limitSimilarChanges = 2;
213
214    /** How many URLs can be added at maximum. */
215    private int             m_maxUrls = 10;
216
217    private Pattern         m_urlPattern;
218    private Akismet         m_akismet;
219
220    private String          m_akismetAPIKey;
221
222    private boolean         m_useCaptcha;
223
224    /** The limit at which we consider something to be spam. */
225    private final int             m_scoreLimit = 1;
226
227    /** If set to true, will ignore anyone who is in Authenticated role. */
228    private boolean         m_ignoreAuthenticated;
229
230    /** Groups allowed to bypass the filter */
231    private String[]         m_allowedGroups;
232
233    private boolean         m_stopAtFirstMatch = true;
234
235    private static String   c_hashName;
236    private static long     c_lastUpdate;
237
238    /** The HASH_DELAY value is a maximum amount of time that an user can keep
239     *  a session open, because after the value has expired, we will invent a new
240     *  hash field name.  By default this is {@value} hours, which should be ample
241     *  time for someone.
242     */
243    private static final long HASH_DELAY = 24;
244
245
246    /**
247     *  {@inheritDoc}
248     */
249    @Override
250    public void initialize( final Engine engine, final Properties properties ) {
251        m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage );
252        m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage);
253        m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength);
254        m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage );
255        m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges );
256        
257        m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges );
258
259        m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls );
260        m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime );
261        m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist );
262
263        m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated );
264        m_allowedGroups = StringUtils.split( StringUtils.defaultString( properties.getProperty( PROP_ALLOWED_GROUPS, m_blacklist ) ), ',' );
265
266        m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" ).equals("asirra");
267
268        try {
269            m_urlPattern = m_compiler.compile( URL_REGEXP );
270        } catch( final MalformedPatternException e ) {
271            LOG.fatal( "Internal error: Someone put in a faulty pattern.", e );
272            throw new InternalWikiException( "Faulty pattern." , e);
273        }
274
275        m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey );
276        m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER );
277
278        LOG.info( "# Spam filter initialized.  Temporary ban time " + m_banTime +
279                  " mins, max page changes/minute: " + m_limitSinglePageChanges );
280    }
281
282    private static final int REJECT = 0;
283    private static final int ACCEPT = 1;
284    private static final int NOTE   = 2;
285
286    private static String log( final Context ctx, final int type, final String source, String message ) {
287        message = TextUtil.replaceString( message, "\r\n", "\\r\\n" );
288        message = TextUtil.replaceString( message, "\"", "\\\"" );
289
290        final String uid = getUniqueID();
291        final String page   = ctx.getPage().getName();
292        final String addr   = ctx.getHttpRequest() != null ? HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-";
293        final String reason;
294        switch( type ) {
295            case REJECT: reason = "REJECTED";
296                break;
297            case ACCEPT: reason = "ACCEPTED";
298                break;
299            case NOTE: reason = "NOTE";
300                break;
301            default: throw new InternalWikiException( "Illegal type " + type );
302        }
303        C_SPAMLOG.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message );
304
305        return uid;
306    }
307
308    /** {@inheritDoc} */
309    @Override
310    public String preSave( final Context context, final String content ) throws RedirectException {
311        cleanBanList();
312        refreshBlacklists( context );
313        final Change change = getChange( context, content );
314
315        if( !ignoreThisUser( context ) ) {
316            checkBanList( context, change );
317            checkSinglePageChange( context, change );
318            checkIPList( context );
319            checkPatternList( context, change );
320            checkPageName( context);
321        }
322
323        if( !m_stopAtFirstMatch ) {
324            final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
325
326            if( score != null && score >= m_scoreLimit ) {
327                throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) );
328            }
329        }
330
331        log( context, ACCEPT, "-", change.toString() );
332        return content;
333    }
334
335    private void checkPageName(final Context context ) throws RedirectException {
336        final Page page = context.getPage();
337        final String pageName = page.getName();
338        final int maxlength = Integer.parseInt(m_pageNameMaxLength);
339        if ( pageName.length() > maxlength) {
340            //
341            //  Spam filter has a match.
342            //
343
344            final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName);
345
346            LOG.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")");
347            checkStrategy( context, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! (Incident code " + uid + ")" );
348
349        }
350    }
351
352    private void checkStrategy(final Context context, final String message ) throws RedirectException {
353        if( m_stopAtFirstMatch ) {
354            throw new RedirectException( message, getRedirectPage( context ) );
355        }
356
357        Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE );
358        if( score != null ) {
359            score = score + 1;
360        } else {
361            score = 1;
362        }
363
364        context.setVariable( ATTR_SPAMFILTER_SCORE, score );
365    }
366    
367    /**
368     *  Parses a list of patterns and returns a Collection of compiled Pattern objects.
369     *
370     * @param source page containing the list of patterns.
371     * @param list list of patterns.
372     * @return A Collection of the Patterns that were found from the lists.
373     */
374    private Collection< Pattern > parseWordList( final Page source, final String list ) {
375        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
376
377        if( list != null ) {
378            final StringTokenizer tok = new StringTokenizer( list, " \t\n" );
379
380            while( tok.hasMoreTokens() ) {
381                final String pattern = tok.nextToken();
382
383                try {
384                    compiledpatterns.add( m_compiler.compile( pattern ) );
385                } catch( final MalformedPatternException e ) {
386                    LOG.debug( "Malformed spam filter pattern " + pattern );
387                    source.setAttribute("error", "Malformed spam filter pattern " + pattern);
388                }
389            }
390        }
391
392        return compiledpatterns;
393    }
394
395    /**
396     *  Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects.
397     *
398     *  @param list list of patterns.
399     *  @return The parsed blacklist patterns.
400     */
401    private Collection< Pattern > parseBlacklist( final String list ) {
402        final ArrayList< Pattern > compiledpatterns = new ArrayList<>();
403
404        if( list != null ) {
405            try {
406                final BufferedReader in = new BufferedReader( new StringReader(list) );
407                String line;
408                while( (line = in.readLine() ) != null ) {
409                    line = line.trim();
410                    if( line.isEmpty() ) continue; // Empty line
411                    if( line.startsWith("#") ) continue; // It's a comment
412
413                    int ws = line.indexOf( ' ' );
414                    if( ws == -1 ) ws = line.indexOf( '\t' );
415                    if( ws != -1 ) line = line.substring( 0, ws );
416
417                    try {
418                        compiledpatterns.add( m_compiler.compile( line ) );
419                    } catch( final MalformedPatternException e ) {
420                        LOG.debug( "Malformed spam filter pattern " + line );
421                    }
422                }
423            } catch( final IOException e ) {
424                LOG.info( "Could not read patterns; returning what I got" , e );
425            }
426        }
427
428        return compiledpatterns;
429    }
430
431    /**
432     * Takes a single page change and performs a load of tests on the content change. An admin can modify anything.
433     *
434     * @param context page Context
435     * @param change page change
436     * @throws RedirectException spam filter rejects the page change.
437     */
438    private synchronized void checkSinglePageChange(final Context context, final Change change )
439            throws RedirectException {
440        final HttpServletRequest req = context.getHttpRequest();
441
442        if( req != null ) {
443            final String addr = HttpUtil.getRemoteAddress( req );
444            int hostCounter = 0;
445            int changeCounter = 0;
446
447            LOG.debug( "Change is " + change.m_change );
448
449            final long time = System.currentTimeMillis() - 60*1000L; // 1 minute
450
451            for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) {
452                final Host host = i.next();
453
454                //  Check if this item is invalid
455                if( host.getAddedTime() < time ) {
456                    LOG.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" );
457                    i.remove();
458                    continue;
459                }
460
461                // Check if this IP address has been seen before
462                if( host.getAddress().equals( addr ) ) {
463                    hostCounter++;
464                }
465
466                //  Check, if this change has been seen before
467                if( host.getChange() != null && host.getChange().equals( change ) ) {
468                    changeCounter++;
469                }
470            }
471
472            //  Now, let's check against the limits.
473            if( hostCounter >= m_limitSinglePageChanges ) {
474                final Host host = new Host( addr, null );
475                m_temporaryBanList.add( host );
476
477                final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change );
478                LOG.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" );
479                checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
480            }
481
482            if( changeCounter >= m_limitSimilarChanges ) {
483                final Host host = new Host( addr, null );
484                m_temporaryBanList.add( host );
485
486                final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change );
487                LOG.info( "SPAM:SimilarModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many similar modifications" );
488                checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")");
489            }
490
491            //  Calculate the number of links in the addition.
492            String tstChange  = change.toString();
493            int urlCounter = 0;
494            while( m_matcher.contains( tstChange,m_urlPattern ) ) {
495                final MatchResult m = m_matcher.getMatch();
496                tstChange = tstChange.substring( m.endOffset(0) );
497                urlCounter++;
498            }
499
500            if( urlCounter > m_maxUrls ) {
501                final Host host = new Host( addr, null );
502                m_temporaryBanList.add( host );
503
504                final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() );
505                LOG.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" );
506                checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" );
507            }
508
509            //  Check bot trap
510            checkBotTrap( context, change );
511
512            //  Check UTF-8 mangling
513            checkUTF8( context, change );
514
515            //  Do Akismet check.  This is good to be the last, because this is the most expensive operation.
516            checkAkismet( context, change );
517
518            m_lastModifications.add( new Host( addr, change ) );
519        }
520    }
521
522
523    /**
524     *  Checks against the akismet system.
525     *
526     * @param context page Context
527     * @throws RedirectException spam filter rejects the page change.
528     */
529    private void checkAkismet( final Context context, final Change change ) throws RedirectException {
530        if( m_akismetAPIKey != null ) {
531            if( m_akismet == null ) {
532                LOG.info( "Initializing Akismet spam protection." );
533                m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() );
534
535                if( !m_akismet.verifyAPIKey() ) {
536                    LOG.error( "Akismet API key cannot be verified.  Please check your config." );
537                    m_akismetAPIKey = null;
538                    m_akismet = null;
539                }
540            }
541
542            final HttpServletRequest req = context.getHttpRequest();
543
544            //  Akismet will mark all empty statements as spam, so we'll just ignore them.
545            if( change.m_adds == 0 && change.m_removals > 0 ) {
546                return;
547            }
548            
549            if( req != null && m_akismet != null ) {
550                LOG.debug( "Calling Akismet to check for spam..." );
551
552                final StopWatch sw = new StopWatch();
553                sw.start();
554
555                final String ipAddress     = HttpUtil.getRemoteAddress( req );
556                final String userAgent     = req.getHeader( "User-Agent" );
557                final String referrer      = req.getHeader( "Referer");
558                final String permalink     = context.getViewURL( context.getPage().getName() );
559                final String commentType   = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit";
560                final String commentAuthor = context.getCurrentUser().getName();
561                final String commentAuthorEmail = null;
562                final String commentAuthorURL   = null;
563
564                final boolean isSpam = m_akismet.commentCheck( ipAddress,
565                                                               userAgent,
566                                                               referrer,
567                                                               permalink,
568                                                               commentType,
569                                                               commentAuthor,
570                                                               commentAuthorEmail,
571                                                               commentAuthorURL,
572                                                               change.toString(),
573                                                               null );
574
575                sw.stop();
576                LOG.debug( "Akismet request done in: " + sw );
577
578                if( isSpam ) {
579                    // Host host = new Host( ipAddress, null );
580                    // m_temporaryBanList.add( host );
581
582                    final String uid = log( context, REJECT, REASON_AKISMET, change.toString() );
583                    LOG.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." );
584                    checkStrategy( context, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" );
585                }
586            }
587        }
588    }
589
590    /**
591     * Returns a static string which can be used to detect spambots which just wildly fill in all the fields.
592     *
593     * @return A string
594     */
595    public static String getBotFieldName() {
596        return "submit_auth";
597    }
598
599    /**
600     * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam.
601     *
602     * @param context page Context
603     * @param change page change
604     * @throws RedirectException spam filter rejects the page change.
605     */
606    private void checkBotTrap( final Context context, final Change change ) throws RedirectException {
607        final HttpServletRequest request = context.getHttpRequest();
608        if( request != null ) {
609            final String unspam = request.getParameter( getBotFieldName() );
610            if( unspam != null && !unspam.isEmpty() ) {
611                final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() );
612
613                LOG.info( "SPAM:BotTrap (" + uid + ").  Wildly behaving bot detected." );
614                checkStrategy( context, "Spamming attempt detected. (Incident code " + uid + ")" );
615            }
616        }
617    }
618
619    private void checkUTF8( final Context context, final Change change ) throws RedirectException {
620        final HttpServletRequest request = context.getHttpRequest();
621        if( request != null ) {
622            final String utf8field = request.getParameter( "encodingcheck" );
623            if( utf8field != null && !utf8field.equals( "\u3041" ) ) {
624                final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() );
625
626                LOG.info( "SPAM:UTF8Trap (" + uid + ").  Wildly posting dumb bot detected." );
627                checkStrategy( context, "Spamming attempt detected. (Incident code " + uid + ")" );
628            }
629        }
630    }
631
632    /** Goes through the ban list and cleans away any host which has expired from it. */
633    private synchronized void cleanBanList() {
634        final long now = System.currentTimeMillis();
635        for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) {
636            final Host host = i.next();
637
638            if( host.getReleaseTime() < now ) {
639                LOG.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" );
640                i.remove();
641            }
642        }
643    }
644
645    /**
646     *  Checks the ban list if the IP address of the changer is already on it.
647     *
648     *  @param context page context
649     *  @throws RedirectException spam filter rejects the page change.
650     */
651    private void checkBanList( final Context context, final Change change ) throws RedirectException {
652        final HttpServletRequest req = context.getHttpRequest();
653
654        if( req != null ) {
655            final String remote = HttpUtil.getRemoteAddress(req);
656            final long now = System.currentTimeMillis();
657
658            for( final Host host : m_temporaryBanList ) {
659                if( host.getAddress().equals( remote ) ) {
660                    final long timeleft = ( host.getReleaseTime() - now ) / 1000L;
661
662                    log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change );
663                    checkStrategy( context,
664                            "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" );
665                }
666            }
667        }
668    }
669
670    /**
671     *  If the spam filter notices changes in the black list page, it will refresh them automatically.
672     *
673     *  @param context associated WikiContext
674     */
675    private void refreshBlacklists( final Context context ) {
676        try {
677            boolean rebuild = false;
678
679            //  Rebuild, if the spam words page, the attachment or the IP ban page has changed since.
680            final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage );
681            if( sourceSpam != null ) {
682                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) {
683                    rebuild = true;
684                }
685            }
686
687            final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist );
688            if( att != null ) {
689                if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) {
690                    rebuild = true;
691                }
692            }
693
694            final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage );
695            if( sourceIPs != null ) {
696                if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) {
697                    rebuild = true;
698                }
699            }
700
701            //  Do the actual rebuilding.  For simplicity's sake, we always rebuild the complete filter list regardless of what changed.
702            if( rebuild ) {
703                m_lastRebuild = new Date();
704                m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null );
705
706                LOG.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage );
707
708                m_IPPatterns = parseWordList( sourceIPs,  ( sourceIPs != null ) ? sourceIPs.getAttribute( LISTIPVAR ) : null );
709                LOG.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage );
710
711                if( att != null ) {
712                    final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att);
713                    final StringWriter out = new StringWriter();
714                    FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out );
715                    final Collection< Pattern > blackList = parseBlacklist( out.toString() );
716                    LOG.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist );
717                    m_spamPatterns.addAll( blackList );
718                }
719            }
720        } catch( final IOException ex ) {
721            LOG.info( "Unable to read attachment data, continuing...", ex );
722        } catch( final ProviderException ex ) {
723            LOG.info( "Failed to read spam filter attachment, continuing...", ex );
724        }
725    }
726
727    /**
728     * Does a check against a known pattern list.
729     *
730     * @param context page Context
731     * @param change page change
732     * @throws RedirectException spam filter rejects the page change.
733     */
734    private void checkPatternList( final Context context, final Change change ) throws RedirectException {
735        // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return.
736        if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) {
737            return;
738        }
739
740        String ch = change.toString();
741        if( context.getHttpRequest() != null ) {
742            ch += HttpUtil.getRemoteAddress( context.getHttpRequest() );
743        }
744
745        for( final Pattern p : m_spamPatterns ) {
746            // LOG.debug("Attempting to match page contents with "+p.getPattern());
747
748            if( m_matcher.contains( ch, p ) ) {
749                //  Spam filter has a match.
750                final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch );
751
752                LOG.info( "SPAM:Regexp (" + uid + "). Content matches the spam filter '" + p.getPattern() + "'" );
753                checkStrategy( context, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" );
754            }
755        }
756    }
757
758
759    /**
760     *  Does a check against a pattern list of IPs.
761     *
762     *  @param context page context
763     *  @throws RedirectException spam filter rejects the page change.
764     */
765    private void checkIPList( final Context context ) throws RedirectException {
766        //  If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return.
767        if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) {
768            return;
769        }
770
771        final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() );
772        LOG.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns");
773
774        for( final Pattern p : m_IPPatterns ) {
775             LOG.debug("Attempting to match remoteIP with " + p.getPattern());
776
777            if( m_matcher.contains( remoteIP, p ) ) {
778
779                //  IP filter has a match.
780                //
781                final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP );
782
783                LOG.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" );
784                checkStrategy( context, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" );
785            }
786        }
787    }
788
789    private void checkPatternList( final Context context, final String change ) throws RedirectException {
790        final Change c = new Change();
791        c.m_change = change;
792        checkPatternList( context, c );
793    }
794 
795    /**
796     *  Creates a simple text string describing the added content.
797     *
798     *  @param context page context
799     *  @param newText added content
800     *  @return Empty string, if there is no change.
801     */
802    private static Change getChange( final Context context, final String newText ) {
803        final Page page = context.getPage();
804        final StringBuffer change = new StringBuffer();
805        final Engine engine = context.getEngine();
806        // Get current page version
807
808        final Change ch = new Change();
809        
810        try {
811            final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION );
812            final String[] first  = Diff.stringToArray( oldText );
813            final String[] second = Diff.stringToArray( newText );
814            final Revision rev = Diff.diff( first, second, new MyersDiff() );
815
816            if( rev == null || rev.size() == 0 ) {
817                return ch;
818            }
819            
820            for( int i = 0; i < rev.size(); i++ ) {
821                final Delta d = rev.getDelta( i );
822
823                if( d instanceof AddDelta ) {
824                    d.getRevised().toString( change, "", "\r\n" );
825                    ch.m_adds++;
826                    
827                } else if( d instanceof ChangeDelta ) {
828                    d.getRevised().toString( change, "", "\r\n" );
829                    ch.m_adds++;
830                    
831                } else if( d instanceof DeleteDelta ) {
832                    ch.m_removals++;
833                }
834            }
835        } catch( final DifferentiationFailedException e ) {
836            LOG.error( "Diff failed", e );
837        }
838
839        //  Don't forget to include the change note, too
840        final String changeNote = page.getAttribute( Page.CHANGENOTE );
841        if( changeNote != null ) {
842            change.append( "\r\n" );
843            change.append( changeNote );
844        }
845
846        //  And author as well
847        if( page.getAuthor() != null ) {
848            change.append( "\r\n" ).append( page.getAuthor() );
849        }
850
851        ch.m_change = change.toString();
852        return ch;
853    }
854
855    /**
856     * Returns true, if this user should be ignored.  For example, admin users.
857     *
858     * @param context page context
859     * @return True, if this user should be ignored.
860     */
861    private boolean ignoreThisUser( final Context context ) {
862        if( context.hasAdminPermissions() ) {
863            return true;
864        }
865
866        final List< String > groups = Arrays.asList( m_allowedGroups );
867        if( Arrays.stream( context.getWikiSession().getRoles() ).anyMatch( role -> groups.contains( role.getName() ) ) ) {
868            return true;
869        }
870
871        if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) {
872            return true;
873        }
874
875        return context.getVariable("captcha") != null;
876    }
877
878    /**
879     *  Returns a random string of six uppercase characters.
880     *
881     *  @return A random string
882     */
883    private static String getUniqueID() {
884        final StringBuilder sb = new StringBuilder();
885        for( int i = 0; i < 6; i++ ) {
886            final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) );
887            sb.append( x );
888        }
889
890        return sb.toString();
891    }
892
893    /**
894     *  Returns a page to which we shall redirect, based on the current value of the "captcha" parameter.
895     *
896     *  @param ctx WikiContext
897     *  @return An URL to redirect to
898     */
899    private String getRedirectPage( final Context ctx ) {
900        if( m_useCaptcha ) {
901            return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) );
902        }
903
904        return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage );
905    }
906
907    /**
908     *  Checks whether the UserProfile matches certain checks.
909     *
910     *  @param profile The profile to check
911     *  @param context The WikiContext
912     *  @return False, if this userprofile is suspect and should not be allowed to be added.
913     *  @since 2.6.1
914     */
915    public boolean isValidUserProfile( final Context context, final UserProfile profile ) {
916        try {
917            checkPatternList( context, profile.getEmail() );
918            checkPatternList( context, profile.getFullname() );
919            checkPatternList( context, profile.getLoginName() );
920        } catch( final RedirectException e ) {
921            LOG.info("Detected attempt to create a spammer user account (see above for rejection reason)");
922            return false;
923        }
924
925        return true;
926    }
927
928    /**
929     *  This method is used to calculate an unique code when submitting the page to detect edit conflicts.  
930     *  It currently incorporates the last-modified date of the page, and the IP address of the submitter.
931     *
932     *  @param page The WikiPage under edit
933     *  @param request The HTTP Request
934     *  @since 2.6
935     *  @return A hash value for this page and session
936     */
937    public static String getSpamHash( final Page page, final HttpServletRequest request ) {
938        long lastModified = 0;
939
940        if( page.getLastModified() != null ) {
941            lastModified = page.getLastModified().getTime();
942        }
943        final long remote = HttpUtil.getRemoteAddress( request ).hashCode();
944
945        return Long.toString( lastModified ^ remote );
946    }
947
948    /**
949     *  Returns the name of the hash field to be used in this request. The value is unique per session, and once 
950     *  the session has expired, you cannot edit anymore.
951     *
952     *  @param request The page request
953     *  @return The name to be used in the hash field
954     *  @since  2.6
955     */
956    public static String getHashFieldName( final HttpServletRequest request ) {
957        String hash = null;
958
959        if( request.getSession() != null ) {
960            hash = ( String )request.getSession().getAttribute( "_hash" );
961
962            if( hash == null ) {
963                hash = c_hashName;
964                request.getSession().setAttribute( "_hash", hash );
965            }
966        }
967
968        if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) {
969            c_hashName = getUniqueID().toLowerCase();
970            c_lastUpdate = System.currentTimeMillis();
971        }
972
973        return hash != null ? hash : c_hashName;
974    }
975
976
977    /**
978     *  This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 
979     *  either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 
980     *  and their session has expired.
981     *  <p>
982     *  This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 
983     *  the spam log (it may or may not be spam, but it's rather likely that it is).
984     *
985     *  @param context The WikiContext
986     *  @param pageContext The JSP PageContext.
987     *  @return True, if hash is okay.  False, if hash is not okay, and you need to redirect.
988     *  @throws IOException If redirection fails
989     *  @since 2.6
990     */
991    public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException {
992        final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() );
993        if( pageContext.getRequest().getParameter(hashName) == null ) {
994            if( pageContext.getAttribute( hashName ) == null ) {
995                final Change change = getChange( context, EditorManager.getEditedText( pageContext ) );
996                log( context, REJECT, "MissingHash", change.m_change );
997
998                final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" );
999                ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect );
1000                return false;
1001            }
1002        }
1003
1004        return true;
1005    }
1006
1007    /**
1008     * This helper method adds all the input fields to your editor that the SpamFilter requires
1009     * to check for spam.  This <i>must</i> be in your editor form if you intend to use the SpamFilter.
1010     *  
1011     * @param pageContext The PageContext
1012     * @return A HTML string which contains input fields for the SpamFilter.
1013     */
1014    public static String insertInputFields( final PageContext pageContext ) {
1015        final Context ctx = Context.findContext( pageContext );
1016        final Engine engine = ctx.getEngine();
1017        final StringBuilder sb = new StringBuilder();
1018        if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) {
1019            sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" );
1020        }
1021
1022        return sb.toString();
1023    }
1024    
1025    /**
1026     *  A local class for storing host information.
1027     */
1028    private class Host {
1029
1030        private final long m_addedTime = System.currentTimeMillis();
1031        private final long m_releaseTime;
1032        private final String m_address;
1033        private final Change m_change;
1034
1035        public String getAddress() {
1036            return m_address;
1037        }
1038
1039        public long getReleaseTime() {
1040            return m_releaseTime;
1041        }
1042
1043        public long getAddedTime() {
1044            return m_addedTime;
1045        }
1046
1047        public Change getChange() {
1048            return m_change;
1049        }
1050
1051        public Host( final String ipaddress, final Change change ) {
1052            m_address = ipaddress;
1053            m_change = change;
1054            m_releaseTime = System.currentTimeMillis() + m_banTime * 60 * 1000L;
1055        }
1056        
1057    }
1058    
1059    private static class Change {
1060        
1061        public String m_change;
1062        public int    m_adds;
1063        public int    m_removals;
1064
1065        @Override
1066        public String toString() {
1067            return m_change;
1068        }
1069
1070        @Override
1071        public boolean equals( final Object o ) {
1072            if( o instanceof Change ) {
1073                return m_change.equals( ( ( Change )o ).m_change );
1074            }
1075            return false;
1076        }
1077
1078        @Override
1079        public int hashCode() {
1080            return m_change.hashCode() + 17;
1081        }
1082        
1083    }
1084
1085}