001/* 002 Licensed to the Apache Software Foundation (ASF) under one 003 or more contributor license agreements. See the NOTICE file 004 distributed with this work for additional information 005 regarding copyright ownership. The ASF licenses this file 006 to you under the Apache License, Version 2.0 (the 007 "License"); you may not use this file except in compliance 008 with the License. You may obtain a copy of the License at 009 010 http://www.apache.org/licenses/LICENSE-2.0 011 012 Unless required by applicable law or agreed to in writing, 013 software distributed under the License is distributed on an 014 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 015 KIND, either express or implied. See the License for the 016 specific language governing permissions and limitations 017 under the License. 018 */ 019package org.apache.wiki.filters; 020 021import net.sf.akismet.Akismet; 022import org.apache.commons.lang3.StringUtils; 023import org.apache.commons.lang3.time.StopWatch; 024import org.apache.logging.log4j.LogManager; 025import org.apache.logging.log4j.Logger; 026import org.apache.oro.text.regex.MalformedPatternException; 027import org.apache.oro.text.regex.MatchResult; 028import org.apache.oro.text.regex.Pattern; 029import org.apache.oro.text.regex.PatternCompiler; 030import org.apache.oro.text.regex.PatternMatcher; 031import org.apache.oro.text.regex.Perl5Compiler; 032import org.apache.oro.text.regex.Perl5Matcher; 033import org.apache.wiki.InternalWikiException; 034import org.apache.wiki.api.core.Attachment; 035import org.apache.wiki.api.core.Context; 036import org.apache.wiki.api.core.ContextEnum; 037import org.apache.wiki.api.core.Engine; 038import org.apache.wiki.api.core.Page; 039import org.apache.wiki.api.exceptions.ProviderException; 040import org.apache.wiki.api.exceptions.RedirectException; 041import org.apache.wiki.api.filters.BasePageFilter; 042import org.apache.wiki.api.providers.WikiProvider; 043import 
org.apache.wiki.attachment.AttachmentManager; 044import org.apache.wiki.auth.user.UserProfile; 045import org.apache.wiki.pages.PageManager; 046import org.apache.wiki.ui.EditorManager; 047import org.apache.wiki.util.FileUtil; 048import org.apache.wiki.util.HttpUtil; 049import org.apache.wiki.util.TextUtil; 050import org.suigeneris.jrcs.diff.Diff; 051import org.suigeneris.jrcs.diff.DifferentiationFailedException; 052import org.suigeneris.jrcs.diff.Revision; 053import org.suigeneris.jrcs.diff.delta.AddDelta; 054import org.suigeneris.jrcs.diff.delta.ChangeDelta; 055import org.suigeneris.jrcs.diff.delta.DeleteDelta; 056import org.suigeneris.jrcs.diff.delta.Delta; 057import org.suigeneris.jrcs.diff.myers.MyersDiff; 058 059import javax.servlet.http.HttpServletRequest; 060import javax.servlet.http.HttpServletResponse; 061import javax.servlet.jsp.PageContext; 062import java.io.BufferedReader; 063import java.io.IOException; 064import java.io.InputStream; 065import java.io.InputStreamReader; 066import java.io.StringReader; 067import java.io.StringWriter; 068import java.nio.charset.StandardCharsets; 069import java.util.ArrayList; 070import java.util.Arrays; 071import java.util.Collection; 072import java.util.Date; 073import java.util.Iterator; 074import java.util.List; 075import java.util.Properties; 076import java.util.Random; 077import java.util.StringTokenizer; 078import java.util.Vector; 079import java.util.concurrent.ThreadLocalRandom; 080 081 082/** 083 * This is Herb, the JSPWiki spamfilter that can also do choke modifications. 084 * 085 * Parameters: 086 * <ul> 087 * <li>wordlist - Page name where the spamword regexps are found. Use [{SET spamwords='regexp list separated with spaces'}] on 088 * that page. Default is "SpamFilterWordList". 089 * <li>IPlist - Page name where the IP regexps are found. Use [{SET ips='regexp list separated with spaces'}] on 090 * that page. Default is "SpamFilterIPList". 091 * <li>maxpagenamelength - Maximum page name length. Default is 100. 
*    <li>blacklist - The name of an attachment containing the list of spam patterns, one per line. Default is
 *        "SpamFilterWordList/blacklist.txt"</li>
 *    <li>errorpage - The page to which the user is redirected. Has a special variable $msg which states the reason. Default is "RejectedMessage".</li>
 *    <li>pagechangesinminute - How many page changes are allowed/minute. Default is 5.</li>
 *    <li>similarchanges - How many similar page changes are allowed before the host is banned. Default is 2. (since 2.4.72)</li>
 *    <li>bantime - How long an IP address stays on the temporary ban list (default is 60 for 60 minutes).</li>
 *    <li>maxurls - How many URLs can be added to the page before it is considered spam (default is 10)</li>
 *    <li>akismet-apikey - The Akismet API key (see akismet.org)</li>
 *    <li>ignoreauthenticated - If set to "true", all authenticated users are ignored and never caught in SpamFilter</li>
 *    <li>captcha - Sets the captcha technology to use. Current allowed values are "none" and "asirra".</li>
 *    <li>strategy - Sets the filtering strategy to use. If set to "eager", will stop at the first probable
 *        match, and won't consider any other tests. This is the default, as it's considerably lighter. If set to "score", will go through all of the tests
 *        and calculate a score for the spam, which is then compared to a filter level value.</li>
105 * </ul> 106 * 107 * <p>Please see the default editors/plain.jsp for examples on how the SpamFilter integrates 108 * with the editor system.</p> 109 * 110 * <p>Changes by admin users are ignored in any case.</p> 111 * 112 * @since 2.1.112 113 */ 114public class SpamFilter extends BasePageFilter { 115 116 private static final String ATTR_SPAMFILTER_SCORE = "spamfilter.score"; 117 private static final String REASON_REGEXP = "Regexp"; 118 private static final String REASON_IP_BANNED_TEMPORARILY = "IPBannedTemporarily"; 119 private static final String REASON_IP_BANNED_PERMANENTLY = "IPBannedPermanently"; 120 private static final String REASON_BOT_TRAP = "BotTrap"; 121 private static final String REASON_AKISMET = "Akismet"; 122 private static final String REASON_TOO_MANY_URLS = "TooManyUrls"; 123 private static final String REASON_SIMILAR_MODIFICATIONS = "SimilarModifications"; 124 private static final String REASON_TOO_MANY_MODIFICATIONS = "TooManyModifications"; 125 private static final String REASON_PAGENAME_TOO_LONG = "PageNameTooLong"; 126 private static final String REASON_UTF8_TRAP = "UTF8Trap"; 127 128 private static final String LISTVAR = "spamwords"; 129 private static final String LISTIPVAR = "ips"; 130 131 private static final Random RANDOM = ThreadLocalRandom.current(); 132 133 /** The filter property name for specifying the page which contains the list of spamwords. Value is <tt>{@value}</tt>. */ 134 public static final String PROP_WORDLIST = "wordlist"; 135 136 /** The filter property name for specifying the page which contains the list of IPs to ban. Value is <tt>{@value}</tt>. */ 137 public static final String PROP_IPLIST = "IPlist"; 138 139 /** The filter property name for specifying the maximum page name length. Value is <tt>{@value}</tt>. */ 140 public static final String PROP_MAX_PAGENAME_LENGTH = "maxpagenamelength"; 141 142 /** The filter property name for the page to which you are directed if Herb rejects your edit. Value is <tt>{@value}</tt>. 
*/ 143 public static final String PROP_ERRORPAGE = "errorpage"; 144 145 /** The filter property name for specifying how many changes is any given IP address 146 * allowed to do per minute. Value is <tt>{@value}</tt>. 147 */ 148 public static final String PROP_PAGECHANGES = "pagechangesinminute"; 149 150 /** The filter property name for specifying how many similar changes are allowed before a host is banned. Value is <tt>{@value}</tt>. */ 151 public static final String PROP_SIMILARCHANGES = "similarchanges"; 152 153 /** The filter property name for specifying how long a host is banned. Value is <tt>{@value}</tt>.*/ 154 public static final String PROP_BANTIME = "bantime"; 155 156 /** The filter property name for the attachment containing the blacklist. Value is <tt>{@value}</tt>.*/ 157 public static final String PROP_BLACKLIST = "blacklist"; 158 159 /** The filter property name for specifying how many URLs can any given edit contain. Value is <tt>{@value}</tt> */ 160 public static final String PROP_MAXURLS = "maxurls"; 161 162 /** The filter property name for specifying the Akismet API-key. Value is <tt>{@value}</tt>. */ 163 public static final String PROP_AKISMET_API_KEY = "akismet-apikey"; 164 165 /** The filter property name for specifying whether authenticated users should be ignored. Value is <tt>{@value}</tt>. */ 166 public static final String PROP_IGNORE_AUTHENTICATED = "ignoreauthenticated"; 167 168 /** The filter property name for specifying groups allowed to bypass the spam filter. Value is <tt>{@value}</tt>. */ 169 public static final String PROP_ALLOWED_GROUPS = "jspwiki.filters.spamfilter.allowedgroups"; 170 171 /** The filter property name for specifying which captcha technology should be used. Value is <tt>{@value}</tt>. */ 172 public static final String PROP_CAPTCHA = "captcha"; 173 174 /** The filter property name for specifying which filter strategy should be used. Value is <tt>{@value}</tt>. 
*/ 175 public static final String PROP_FILTERSTRATEGY = "strategy"; 176 177 /** The string specifying the "eager" strategy. Value is <tt>{@value}</tt>. */ 178 public static final String STRATEGY_EAGER = "eager"; 179 180 /** The string specifying the "score" strategy. Value is <tt>{@value}</tt>. */ 181 public static final String STRATEGY_SCORE = "score"; 182 183 private static final String URL_REGEXP = "(http://|https://|mailto:)([A-Za-z0-9_/\\.\\+\\?\\#\\-\\@=&;]+)"; 184 185 private String m_forbiddenWordsPage = "SpamFilterWordList"; 186 private String m_forbiddenIPsPage = "SpamFilterIPList"; 187 private String m_pageNameMaxLength = "100"; 188 private String m_errorPage = "RejectedMessage"; 189 private String m_blacklist = "SpamFilterWordList/blacklist.txt"; 190 191 private final PatternMatcher m_matcher = new Perl5Matcher(); 192 private final PatternCompiler m_compiler = new Perl5Compiler(); 193 194 private Collection<Pattern> m_spamPatterns; 195 private Collection<Pattern> m_IPPatterns; 196 197 private Date m_lastRebuild = new Date( 0L ); 198 199 private static final Logger C_SPAMLOG = LogManager.getLogger( "SpamLog" ); 200 private static final Logger LOG = LogManager.getLogger( SpamFilter.class ); 201 202 private final Vector<Host> m_temporaryBanList = new Vector<>(); 203 204 private int m_banTime = 60; // minutes 205 206 private final Vector<Host> m_lastModifications = new Vector<>(); 207 208 /** How many times a single IP address can change a page per minute? */ 209 private int m_limitSinglePageChanges = 5; 210 211 /** How many times can you add the exact same string to a page? */ 212 private int m_limitSimilarChanges = 2; 213 214 /** How many URLs can be added at maximum. */ 215 private int m_maxUrls = 10; 216 217 private Pattern m_urlPattern; 218 private Akismet m_akismet; 219 220 private String m_akismetAPIKey; 221 222 private boolean m_useCaptcha; 223 224 /** The limit at which we consider something to be spam. 
*/ 225 private final int m_scoreLimit = 1; 226 227 /** If set to true, will ignore anyone who is in Authenticated role. */ 228 private boolean m_ignoreAuthenticated; 229 230 /** Groups allowed to bypass the filter */ 231 private String[] m_allowedGroups; 232 233 private boolean m_stopAtFirstMatch = true; 234 235 private static String c_hashName; 236 private static long c_lastUpdate; 237 238 /** The HASH_DELAY value is a maximum amount of time that an user can keep 239 * a session open, because after the value has expired, we will invent a new 240 * hash field name. By default this is {@value} hours, which should be ample 241 * time for someone. 242 */ 243 private static final long HASH_DELAY = 24; 244 245 246 /** 247 * {@inheritDoc} 248 */ 249 @Override 250 public void initialize( final Engine engine, final Properties properties ) { 251 m_forbiddenWordsPage = properties.getProperty( PROP_WORDLIST, m_forbiddenWordsPage ); 252 m_forbiddenIPsPage = properties.getProperty( PROP_IPLIST, m_forbiddenIPsPage); 253 m_pageNameMaxLength = properties.getProperty( PROP_MAX_PAGENAME_LENGTH, m_pageNameMaxLength); 254 m_errorPage = properties.getProperty( PROP_ERRORPAGE, m_errorPage ); 255 m_limitSinglePageChanges = TextUtil.getIntegerProperty( properties, PROP_PAGECHANGES, m_limitSinglePageChanges ); 256 257 m_limitSimilarChanges = TextUtil.getIntegerProperty( properties, PROP_SIMILARCHANGES, m_limitSimilarChanges ); 258 259 m_maxUrls = TextUtil.getIntegerProperty( properties, PROP_MAXURLS, m_maxUrls ); 260 m_banTime = TextUtil.getIntegerProperty( properties, PROP_BANTIME, m_banTime ); 261 m_blacklist = properties.getProperty( PROP_BLACKLIST, m_blacklist ); 262 263 m_ignoreAuthenticated = TextUtil.getBooleanProperty( properties, PROP_IGNORE_AUTHENTICATED, m_ignoreAuthenticated ); 264 m_allowedGroups = StringUtils.split( StringUtils.defaultString( properties.getProperty( PROP_ALLOWED_GROUPS, m_blacklist ) ), ',' ); 265 266 m_useCaptcha = properties.getProperty( PROP_CAPTCHA, "" 
).equals("asirra"); 267 268 try { 269 m_urlPattern = m_compiler.compile( URL_REGEXP ); 270 } catch( final MalformedPatternException e ) { 271 LOG.fatal( "Internal error: Someone put in a faulty pattern.", e ); 272 throw new InternalWikiException( "Faulty pattern." , e); 273 } 274 275 m_akismetAPIKey = TextUtil.getStringProperty( properties, PROP_AKISMET_API_KEY, m_akismetAPIKey ); 276 m_stopAtFirstMatch = TextUtil.getStringProperty( properties, PROP_FILTERSTRATEGY, STRATEGY_EAGER ).equals( STRATEGY_EAGER ); 277 278 LOG.info( "# Spam filter initialized. Temporary ban time " + m_banTime + 279 " mins, max page changes/minute: " + m_limitSinglePageChanges ); 280 } 281 282 private static final int REJECT = 0; 283 private static final int ACCEPT = 1; 284 private static final int NOTE = 2; 285 286 private static String log( final Context ctx, final int type, final String source, String message ) { 287 message = TextUtil.replaceString( message, "\r\n", "\\r\\n" ); 288 message = TextUtil.replaceString( message, "\"", "\\\"" ); 289 290 final String uid = getUniqueID(); 291 final String page = ctx.getPage().getName(); 292 final String addr = ctx.getHttpRequest() != null ? 
HttpUtil.getRemoteAddress( ctx.getHttpRequest() ) : "-"; 293 final String reason; 294 switch( type ) { 295 case REJECT: reason = "REJECTED"; 296 break; 297 case ACCEPT: reason = "ACCEPTED"; 298 break; 299 case NOTE: reason = "NOTE"; 300 break; 301 default: throw new InternalWikiException( "Illegal type " + type ); 302 } 303 C_SPAMLOG.info( reason + " " + source + " " + uid + " " + addr + " \"" + page + "\" " + message ); 304 305 return uid; 306 } 307 308 /** {@inheritDoc} */ 309 @Override 310 public String preSave( final Context context, final String content ) throws RedirectException { 311 cleanBanList(); 312 refreshBlacklists( context ); 313 final Change change = getChange( context, content ); 314 315 if( !ignoreThisUser( context ) ) { 316 checkBanList( context, change ); 317 checkSinglePageChange( context, change ); 318 checkIPList( context ); 319 checkPatternList( context, change ); 320 checkPageName( context); 321 } 322 323 if( !m_stopAtFirstMatch ) { 324 final Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 325 326 if( score != null && score >= m_scoreLimit ) { 327 throw new RedirectException( "Herb says you got too many points", getRedirectPage( context ) ); 328 } 329 } 330 331 log( context, ACCEPT, "-", change.toString() ); 332 return content; 333 } 334 335 private void checkPageName(final Context context ) throws RedirectException { 336 final Page page = context.getPage(); 337 final String pageName = page.getName(); 338 final int maxlength = Integer.parseInt(m_pageNameMaxLength); 339 if ( pageName.length() > maxlength) { 340 // 341 // Spam filter has a match. 342 // 343 344 final String uid = log( context, REJECT, REASON_PAGENAME_TOO_LONG + "(" + m_pageNameMaxLength + ")" , pageName); 345 346 LOG.info("SPAM:PageNameTooLong (" + uid + "). The length of the page name is too large (" + pageName.length() + " , limit is " + m_pageNameMaxLength + ")"); 347 checkStrategy( context, "Herb says '" + pageName + "' is a bad pageName and I trust Herb! 
(Incident code " + uid + ")" ); 348 349 } 350 } 351 352 private void checkStrategy(final Context context, final String message ) throws RedirectException { 353 if( m_stopAtFirstMatch ) { 354 throw new RedirectException( message, getRedirectPage( context ) ); 355 } 356 357 Integer score = context.getVariable( ATTR_SPAMFILTER_SCORE ); 358 if( score != null ) { 359 score = score + 1; 360 } else { 361 score = 1; 362 } 363 364 context.setVariable( ATTR_SPAMFILTER_SCORE, score ); 365 } 366 367 /** 368 * Parses a list of patterns and returns a Collection of compiled Pattern objects. 369 * 370 * @param source page containing the list of patterns. 371 * @param list list of patterns. 372 * @return A Collection of the Patterns that were found from the lists. 373 */ 374 private Collection< Pattern > parseWordList( final Page source, final String list ) { 375 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 376 377 if( list != null ) { 378 final StringTokenizer tok = new StringTokenizer( list, " \t\n" ); 379 380 while( tok.hasMoreTokens() ) { 381 final String pattern = tok.nextToken(); 382 383 try { 384 compiledpatterns.add( m_compiler.compile( pattern ) ); 385 } catch( final MalformedPatternException e ) { 386 LOG.debug( "Malformed spam filter pattern " + pattern ); 387 source.setAttribute("error", "Malformed spam filter pattern " + pattern); 388 } 389 } 390 } 391 392 return compiledpatterns; 393 } 394 395 /** 396 * Takes a MT-Blacklist -formatted blacklist and returns a list of compiled Pattern objects. 397 * 398 * @param list list of patterns. 399 * @return The parsed blacklist patterns. 
400 */ 401 private Collection< Pattern > parseBlacklist( final String list ) { 402 final ArrayList< Pattern > compiledpatterns = new ArrayList<>(); 403 404 if( list != null ) { 405 try { 406 final BufferedReader in = new BufferedReader( new StringReader(list) ); 407 String line; 408 while( (line = in.readLine() ) != null ) { 409 line = line.trim(); 410 if( line.isEmpty() ) continue; // Empty line 411 if( line.startsWith("#") ) continue; // It's a comment 412 413 int ws = line.indexOf( ' ' ); 414 if( ws == -1 ) ws = line.indexOf( '\t' ); 415 if( ws != -1 ) line = line.substring( 0, ws ); 416 417 try { 418 compiledpatterns.add( m_compiler.compile( line ) ); 419 } catch( final MalformedPatternException e ) { 420 LOG.debug( "Malformed spam filter pattern " + line ); 421 } 422 } 423 } catch( final IOException e ) { 424 LOG.info( "Could not read patterns; returning what I got" , e ); 425 } 426 } 427 428 return compiledpatterns; 429 } 430 431 /** 432 * Takes a single page change and performs a load of tests on the content change. An admin can modify anything. 433 * 434 * @param context page Context 435 * @param change page change 436 * @throws RedirectException spam filter rejects the page change. 
437 */ 438 private synchronized void checkSinglePageChange(final Context context, final Change change ) 439 throws RedirectException { 440 final HttpServletRequest req = context.getHttpRequest(); 441 442 if( req != null ) { 443 final String addr = HttpUtil.getRemoteAddress( req ); 444 int hostCounter = 0; 445 int changeCounter = 0; 446 447 LOG.debug( "Change is " + change.m_change ); 448 449 final long time = System.currentTimeMillis() - 60*1000L; // 1 minute 450 451 for( final Iterator< Host > i = m_lastModifications.iterator(); i.hasNext(); ) { 452 final Host host = i.next(); 453 454 // Check if this item is invalid 455 if( host.getAddedTime() < time ) { 456 LOG.debug( "Removed host " + host.getAddress() + " from modification queue (expired)" ); 457 i.remove(); 458 continue; 459 } 460 461 // Check if this IP address has been seen before 462 if( host.getAddress().equals( addr ) ) { 463 hostCounter++; 464 } 465 466 // Check, if this change has been seen before 467 if( host.getChange() != null && host.getChange().equals( change ) ) { 468 changeCounter++; 469 } 470 } 471 472 // Now, let's check against the limits. 473 if( hostCounter >= m_limitSinglePageChanges ) { 474 final Host host = new Host( addr, null ); 475 m_temporaryBanList.add( host ); 476 477 final String uid = log( context, REJECT, REASON_TOO_MANY_MODIFICATIONS, change.m_change ); 478 LOG.info( "SPAM:TooManyModifications (" + uid + "). Added host " + addr + " to temporary ban list for doing too many modifications/minute" ); 479 checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 480 } 481 482 if( changeCounter >= m_limitSimilarChanges ) { 483 final Host host = new Host( addr, null ); 484 m_temporaryBanList.add( host ); 485 486 final String uid = log( context, REJECT, REASON_SIMILAR_MODIFICATIONS, change.m_change ); 487 LOG.info( "SPAM:SimilarModifications (" + uid + "). 
Added host " + addr + " to temporary ban list for doing too many similar modifications" ); 488 checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code "+uid+")"); 489 } 490 491 // Calculate the number of links in the addition. 492 String tstChange = change.toString(); 493 int urlCounter = 0; 494 while( m_matcher.contains( tstChange,m_urlPattern ) ) { 495 final MatchResult m = m_matcher.getMatch(); 496 tstChange = tstChange.substring( m.endOffset(0) ); 497 urlCounter++; 498 } 499 500 if( urlCounter > m_maxUrls ) { 501 final Host host = new Host( addr, null ); 502 m_temporaryBanList.add( host ); 503 504 final String uid = log( context, REJECT, REASON_TOO_MANY_URLS, change.toString() ); 505 LOG.info( "SPAM:TooManyUrls (" + uid + "). Added host " + addr + " to temporary ban list for adding too many URLs" ); 506 checkStrategy( context, "Herb says you look like a spammer, and I trust Herb! (Incident code " + uid + ")" ); 507 } 508 509 // Check bot trap 510 checkBotTrap( context, change ); 511 512 // Check UTF-8 mangling 513 checkUTF8( context, change ); 514 515 // Do Akismet check. This is good to be the last, because this is the most expensive operation. 516 checkAkismet( context, change ); 517 518 m_lastModifications.add( new Host( addr, change ) ); 519 } 520 } 521 522 523 /** 524 * Checks against the akismet system. 525 * 526 * @param context page Context 527 * @throws RedirectException spam filter rejects the page change. 528 */ 529 private void checkAkismet( final Context context, final Change change ) throws RedirectException { 530 if( m_akismetAPIKey != null ) { 531 if( m_akismet == null ) { 532 LOG.info( "Initializing Akismet spam protection." ); 533 m_akismet = new Akismet( m_akismetAPIKey, context.getEngine().getBaseURL() ); 534 535 if( !m_akismet.verifyAPIKey() ) { 536 LOG.error( "Akismet API key cannot be verified. Please check your config." 
); 537 m_akismetAPIKey = null; 538 m_akismet = null; 539 } 540 } 541 542 final HttpServletRequest req = context.getHttpRequest(); 543 544 // Akismet will mark all empty statements as spam, so we'll just ignore them. 545 if( change.m_adds == 0 && change.m_removals > 0 ) { 546 return; 547 } 548 549 if( req != null && m_akismet != null ) { 550 LOG.debug( "Calling Akismet to check for spam..." ); 551 552 final StopWatch sw = new StopWatch(); 553 sw.start(); 554 555 final String ipAddress = HttpUtil.getRemoteAddress( req ); 556 final String userAgent = req.getHeader( "User-Agent" ); 557 final String referrer = req.getHeader( "Referer"); 558 final String permalink = context.getViewURL( context.getPage().getName() ); 559 final String commentType = context.getRequestContext().equals( ContextEnum.PAGE_COMMENT.getRequestContext() ) ? "comment" : "edit"; 560 final String commentAuthor = context.getCurrentUser().getName(); 561 final String commentAuthorEmail = null; 562 final String commentAuthorURL = null; 563 564 final boolean isSpam = m_akismet.commentCheck( ipAddress, 565 userAgent, 566 referrer, 567 permalink, 568 commentType, 569 commentAuthor, 570 commentAuthorEmail, 571 commentAuthorURL, 572 change.toString(), 573 null ); 574 575 sw.stop(); 576 LOG.debug( "Akismet request done in: " + sw ); 577 578 if( isSpam ) { 579 // Host host = new Host( ipAddress, null ); 580 // m_temporaryBanList.add( host ); 581 582 final String uid = log( context, REJECT, REASON_AKISMET, change.toString() ); 583 LOG.info( "SPAM:Akismet (" + uid + "). Akismet thinks this change is spam; added host to temporary ban list." ); 584 checkStrategy( context, "Akismet tells Herb you're a spammer, Herb trusts Akismet, and I trust Herb! (Incident code " + uid + ")" ); 585 } 586 } 587 } 588 } 589 590 /** 591 * Returns a static string which can be used to detect spambots which just wildly fill in all the fields. 
592 * 593 * @return A string 594 */ 595 public static String getBotFieldName() { 596 return "submit_auth"; 597 } 598 599 /** 600 * This checks whether an invisible field is available in the request, and whether it's contents are suspected spam. 601 * 602 * @param context page Context 603 * @param change page change 604 * @throws RedirectException spam filter rejects the page change. 605 */ 606 private void checkBotTrap( final Context context, final Change change ) throws RedirectException { 607 final HttpServletRequest request = context.getHttpRequest(); 608 if( request != null ) { 609 final String unspam = request.getParameter( getBotFieldName() ); 610 if( unspam != null && !unspam.isEmpty() ) { 611 final String uid = log( context, REJECT, REASON_BOT_TRAP, change.toString() ); 612 613 LOG.info( "SPAM:BotTrap (" + uid + "). Wildly behaving bot detected." ); 614 checkStrategy( context, "Spamming attempt detected. (Incident code " + uid + ")" ); 615 } 616 } 617 } 618 619 private void checkUTF8( final Context context, final Change change ) throws RedirectException { 620 final HttpServletRequest request = context.getHttpRequest(); 621 if( request != null ) { 622 final String utf8field = request.getParameter( "encodingcheck" ); 623 if( utf8field != null && !utf8field.equals( "\u3041" ) ) { 624 final String uid = log( context, REJECT, REASON_UTF8_TRAP, change.toString() ); 625 626 LOG.info( "SPAM:UTF8Trap (" + uid + "). Wildly posting dumb bot detected." ); 627 checkStrategy( context, "Spamming attempt detected. (Incident code " + uid + ")" ); 628 } 629 } 630 } 631 632 /** Goes through the ban list and cleans away any host which has expired from it. 
*/ 633 private synchronized void cleanBanList() { 634 final long now = System.currentTimeMillis(); 635 for( final Iterator< Host > i = m_temporaryBanList.iterator(); i.hasNext(); ) { 636 final Host host = i.next(); 637 638 if( host.getReleaseTime() < now ) { 639 LOG.debug( "Removed host " + host.getAddress() + " from temporary ban list (expired)" ); 640 i.remove(); 641 } 642 } 643 } 644 645 /** 646 * Checks the ban list if the IP address of the changer is already on it. 647 * 648 * @param context page context 649 * @throws RedirectException spam filter rejects the page change. 650 */ 651 private void checkBanList( final Context context, final Change change ) throws RedirectException { 652 final HttpServletRequest req = context.getHttpRequest(); 653 654 if( req != null ) { 655 final String remote = HttpUtil.getRemoteAddress(req); 656 final long now = System.currentTimeMillis(); 657 658 for( final Host host : m_temporaryBanList ) { 659 if( host.getAddress().equals( remote ) ) { 660 final long timeleft = ( host.getReleaseTime() - now ) / 1000L; 661 662 log( context, REJECT, REASON_IP_BANNED_TEMPORARILY, change.m_change ); 663 checkStrategy( context, 664 "You have been temporarily banned from modifying this wiki. (" + timeleft + " seconds of ban left)" ); 665 } 666 } 667 } 668 } 669 670 /** 671 * If the spam filter notices changes in the black list page, it will refresh them automatically. 672 * 673 * @param context associated WikiContext 674 */ 675 private void refreshBlacklists( final Context context ) { 676 try { 677 boolean rebuild = false; 678 679 // Rebuild, if the spam words page, the attachment or the IP ban page has changed since. 
680 final Page sourceSpam = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenWordsPage ); 681 if( sourceSpam != null ) { 682 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || sourceSpam.getLastModified().after( m_lastRebuild ) ) { 683 rebuild = true; 684 } 685 } 686 687 final Attachment att = context.getEngine().getManager( AttachmentManager.class ).getAttachmentInfo( context, m_blacklist ); 688 if( att != null ) { 689 if( m_spamPatterns == null || m_spamPatterns.isEmpty() || att.getLastModified().after( m_lastRebuild ) ) { 690 rebuild = true; 691 } 692 } 693 694 final Page sourceIPs = context.getEngine().getManager( PageManager.class ).getPage( m_forbiddenIPsPage ); 695 if( sourceIPs != null ) { 696 if( m_IPPatterns == null || m_IPPatterns.isEmpty() || sourceIPs.getLastModified().after( m_lastRebuild ) ) { 697 rebuild = true; 698 } 699 } 700 701 // Do the actual rebuilding. For simplicity's sake, we always rebuild the complete filter list regardless of what changed. 702 if( rebuild ) { 703 m_lastRebuild = new Date(); 704 m_spamPatterns = parseWordList( sourceSpam, ( sourceSpam != null ) ? sourceSpam.getAttribute( LISTVAR ) : null ); 705 706 LOG.info( "Spam filter reloaded - recognizing " + m_spamPatterns.size() + " patterns from page " + m_forbiddenWordsPage ); 707 708 m_IPPatterns = parseWordList( sourceIPs, ( sourceIPs != null ) ? 
sourceIPs.getAttribute( LISTIPVAR ) : null ); 709 LOG.info( "IP filter reloaded - recognizing " + m_IPPatterns.size() + " patterns from page " + m_forbiddenIPsPage ); 710 711 if( att != null ) { 712 final InputStream in = context.getEngine().getManager( AttachmentManager.class ).getAttachmentStream(att); 713 final StringWriter out = new StringWriter(); 714 FileUtil.copyContents( new InputStreamReader( in, StandardCharsets.UTF_8 ), out ); 715 final Collection< Pattern > blackList = parseBlacklist( out.toString() ); 716 LOG.info( "...recognizing additional " + blackList.size() + " patterns from blacklist " + m_blacklist ); 717 m_spamPatterns.addAll( blackList ); 718 } 719 } 720 } catch( final IOException ex ) { 721 LOG.info( "Unable to read attachment data, continuing...", ex ); 722 } catch( final ProviderException ex ) { 723 LOG.info( "Failed to read spam filter attachment, continuing...", ex ); 724 } 725 } 726 727 /** 728 * Does a check against a known pattern list. 729 * 730 * @param context page Context 731 * @param change page change 732 * @throws RedirectException spam filter rejects the page change. 733 */ 734 private void checkPatternList( final Context context, final Change change ) throws RedirectException { 735 // If we have no spam patterns defined, or we're trying to save the page containing the patterns, just return. 736 if( m_spamPatterns == null || context.getPage().getName().equals( m_forbiddenWordsPage ) ) { 737 return; 738 } 739 740 String ch = change.toString(); 741 if( context.getHttpRequest() != null ) { 742 ch += HttpUtil.getRemoteAddress( context.getHttpRequest() ); 743 } 744 745 for( final Pattern p : m_spamPatterns ) { 746 // LOG.debug("Attempting to match page contents with "+p.getPattern()); 747 748 if( m_matcher.contains( ch, p ) ) { 749 // Spam filter has a match. 750 final String uid = log( context, REJECT, REASON_REGEXP + "(" + p.getPattern() + ")", ch ); 751 752 LOG.info( "SPAM:Regexp (" + uid + "). 
Content matches the spam filter '" + p.getPattern() + "'" ); 753 checkStrategy( context, "Herb says '" + p.getPattern() + "' is a bad spam word and I trust Herb! (Incident code " + uid + ")" ); 754 } 755 } 756 } 757 758 759 /** 760 * Does a check against a pattern list of IPs. 761 * 762 * @param context page context 763 * @throws RedirectException spam filter rejects the page change. 764 */ 765 private void checkIPList( final Context context ) throws RedirectException { 766 // If we have no IP patterns defined, or we're trying to save the page containing the IP patterns, just return. 767 if( m_IPPatterns == null || context.getPage().getName().equals( m_forbiddenIPsPage ) ) { 768 return; 769 } 770 771 final String remoteIP = HttpUtil.getRemoteAddress( context.getHttpRequest() ); 772 LOG.info("Attempting to match remoteIP " + remoteIP + " against " + m_IPPatterns.size() + " patterns"); 773 774 for( final Pattern p : m_IPPatterns ) { 775 LOG.debug("Attempting to match remoteIP with " + p.getPattern()); 776 777 if( m_matcher.contains( remoteIP, p ) ) { 778 779 // IP filter has a match. 780 // 781 final String uid = log( context, REJECT, REASON_IP_BANNED_PERMANENTLY + "(" + p.getPattern() + ")", remoteIP ); 782 783 LOG.info( "SPAM:IPBanList (" + uid + "). remoteIP matches the IP filter '" + p.getPattern() + "'" ); 784 checkStrategy( context, "Herb says '" + p.getPattern() + "' is a banned IP and I trust Herb! (Incident code " + uid + ")" ); 785 } 786 } 787 } 788 789 private void checkPatternList( final Context context, final String change ) throws RedirectException { 790 final Change c = new Change(); 791 c.m_change = change; 792 checkPatternList( context, c ); 793 } 794 795 /** 796 * Creates a simple text string describing the added content. 797 * 798 * @param context page context 799 * @param newText added content 800 * @return Empty string, if there is no change. 
801 */ 802 private static Change getChange( final Context context, final String newText ) { 803 final Page page = context.getPage(); 804 final StringBuffer change = new StringBuffer(); 805 final Engine engine = context.getEngine(); 806 // Get current page version 807 808 final Change ch = new Change(); 809 810 try { 811 final String oldText = engine.getManager( PageManager.class ).getPureText( page.getName(), WikiProvider.LATEST_VERSION ); 812 final String[] first = Diff.stringToArray( oldText ); 813 final String[] second = Diff.stringToArray( newText ); 814 final Revision rev = Diff.diff( first, second, new MyersDiff() ); 815 816 if( rev == null || rev.size() == 0 ) { 817 return ch; 818 } 819 820 for( int i = 0; i < rev.size(); i++ ) { 821 final Delta d = rev.getDelta( i ); 822 823 if( d instanceof AddDelta ) { 824 d.getRevised().toString( change, "", "\r\n" ); 825 ch.m_adds++; 826 827 } else if( d instanceof ChangeDelta ) { 828 d.getRevised().toString( change, "", "\r\n" ); 829 ch.m_adds++; 830 831 } else if( d instanceof DeleteDelta ) { 832 ch.m_removals++; 833 } 834 } 835 } catch( final DifferentiationFailedException e ) { 836 LOG.error( "Diff failed", e ); 837 } 838 839 // Don't forget to include the change note, too 840 final String changeNote = page.getAttribute( Page.CHANGENOTE ); 841 if( changeNote != null ) { 842 change.append( "\r\n" ); 843 change.append( changeNote ); 844 } 845 846 // And author as well 847 if( page.getAuthor() != null ) { 848 change.append( "\r\n" ).append( page.getAuthor() ); 849 } 850 851 ch.m_change = change.toString(); 852 return ch; 853 } 854 855 /** 856 * Returns true, if this user should be ignored. For example, admin users. 857 * 858 * @param context page context 859 * @return True, if this user should be ignored. 
860 */ 861 private boolean ignoreThisUser( final Context context ) { 862 if( context.hasAdminPermissions() ) { 863 return true; 864 } 865 866 final List< String > groups = Arrays.asList( m_allowedGroups ); 867 if( Arrays.stream( context.getWikiSession().getRoles() ).anyMatch( role -> groups.contains( role.getName() ) ) ) { 868 return true; 869 } 870 871 if( m_ignoreAuthenticated && context.getWikiSession().isAuthenticated() ) { 872 return true; 873 } 874 875 return context.getVariable("captcha") != null; 876 } 877 878 /** 879 * Returns a random string of six uppercase characters. 880 * 881 * @return A random string 882 */ 883 private static String getUniqueID() { 884 final StringBuilder sb = new StringBuilder(); 885 for( int i = 0; i < 6; i++ ) { 886 final char x = ( char )( 'A' + RANDOM.nextInt( 26 ) ); 887 sb.append( x ); 888 } 889 890 return sb.toString(); 891 } 892 893 /** 894 * Returns a page to which we shall redirect, based on the current value of the "captcha" parameter. 895 * 896 * @param ctx WikiContext 897 * @return An URL to redirect to 898 */ 899 private String getRedirectPage( final Context ctx ) { 900 if( m_useCaptcha ) { 901 return ctx.getURL( ContextEnum.PAGE_NONE.getRequestContext(), "Captcha.jsp", "page= " +ctx.getEngine().encodeName( ctx.getPage().getName() ) ); 902 } 903 904 return ctx.getURL( ContextEnum.PAGE_VIEW.getRequestContext(), m_errorPage ); 905 } 906 907 /** 908 * Checks whether the UserProfile matches certain checks. 909 * 910 * @param profile The profile to check 911 * @param context The WikiContext 912 * @return False, if this userprofile is suspect and should not be allowed to be added. 
913 * @since 2.6.1 914 */ 915 public boolean isValidUserProfile( final Context context, final UserProfile profile ) { 916 try { 917 checkPatternList( context, profile.getEmail() ); 918 checkPatternList( context, profile.getFullname() ); 919 checkPatternList( context, profile.getLoginName() ); 920 } catch( final RedirectException e ) { 921 LOG.info("Detected attempt to create a spammer user account (see above for rejection reason)"); 922 return false; 923 } 924 925 return true; 926 } 927 928 /** 929 * This method is used to calculate an unique code when submitting the page to detect edit conflicts. 930 * It currently incorporates the last-modified date of the page, and the IP address of the submitter. 931 * 932 * @param page The WikiPage under edit 933 * @param request The HTTP Request 934 * @since 2.6 935 * @return A hash value for this page and session 936 */ 937 public static String getSpamHash( final Page page, final HttpServletRequest request ) { 938 long lastModified = 0; 939 940 if( page.getLastModified() != null ) { 941 lastModified = page.getLastModified().getTime(); 942 } 943 final long remote = HttpUtil.getRemoteAddress( request ).hashCode(); 944 945 return Long.toString( lastModified ^ remote ); 946 } 947 948 /** 949 * Returns the name of the hash field to be used in this request. The value is unique per session, and once 950 * the session has expired, you cannot edit anymore. 
951 * 952 * @param request The page request 953 * @return The name to be used in the hash field 954 * @since 2.6 955 */ 956 public static String getHashFieldName( final HttpServletRequest request ) { 957 String hash = null; 958 959 if( request.getSession() != null ) { 960 hash = ( String )request.getSession().getAttribute( "_hash" ); 961 962 if( hash == null ) { 963 hash = c_hashName; 964 request.getSession().setAttribute( "_hash", hash ); 965 } 966 } 967 968 if( c_hashName == null || c_lastUpdate < ( System.currentTimeMillis() - HASH_DELAY * 60 * 60 * 1000 ) ) { 969 c_hashName = getUniqueID().toLowerCase(); 970 c_lastUpdate = System.currentTimeMillis(); 971 } 972 973 return hash != null ? hash : c_hashName; 974 } 975 976 977 /** 978 * This method checks if the hash value is still valid, i.e. if it exists at all. This can occur in two cases: 979 * either this is a spam bot which is not adaptive, or it is someone who has been editing one page for too long, 980 * and their session has expired. 981 * <p> 982 * This method puts a redirect to the http response field to page "SessionExpired" and logs the incident in 983 * the spam log (it may or may not be spam, but it's rather likely that it is). 984 * 985 * @param context The WikiContext 986 * @param pageContext The JSP PageContext. 987 * @return True, if hash is okay. False, if hash is not okay, and you need to redirect. 
988 * @throws IOException If redirection fails 989 * @since 2.6 990 */ 991 public static boolean checkHash( final Context context, final PageContext pageContext ) throws IOException { 992 final String hashName = getHashFieldName( (HttpServletRequest)pageContext.getRequest() ); 993 if( pageContext.getRequest().getParameter(hashName) == null ) { 994 if( pageContext.getAttribute( hashName ) == null ) { 995 final Change change = getChange( context, EditorManager.getEditedText( pageContext ) ); 996 log( context, REJECT, "MissingHash", change.m_change ); 997 998 final String redirect = context.getURL( ContextEnum.PAGE_VIEW.getRequestContext(),"SessionExpired" ); 999 ( ( HttpServletResponse )pageContext.getResponse() ).sendRedirect( redirect ); 1000 return false; 1001 } 1002 } 1003 1004 return true; 1005 } 1006 1007 /** 1008 * This helper method adds all the input fields to your editor that the SpamFilter requires 1009 * to check for spam. This <i>must</i> be in your editor form if you intend to use the SpamFilter. 1010 * 1011 * @param pageContext The PageContext 1012 * @return A HTML string which contains input fields for the SpamFilter. 1013 */ 1014 public static String insertInputFields( final PageContext pageContext ) { 1015 final Context ctx = Context.findContext( pageContext ); 1016 final Engine engine = ctx.getEngine(); 1017 final StringBuilder sb = new StringBuilder(); 1018 if( engine.getContentEncoding().equals( StandardCharsets.UTF_8 ) ) { 1019 sb.append( "<input name='encodingcheck' type='hidden' value='\u3041' />\n" ); 1020 } 1021 1022 return sb.toString(); 1023 } 1024 1025 /** 1026 * A local class for storing host information. 
*/
    private class Host {

        /** Creation time of this entry, in milliseconds. */
        private final long m_addedTime = System.currentTimeMillis();
        /** Creation time plus the ban time of the enclosing filter, in milliseconds. */
        private final long m_releaseTime;
        private final String m_address;
        private final Change m_change;

        public String getAddress() {
            return m_address;
        }

        public long getReleaseTime() {
            return m_releaseTime;
        }

        public long getAddedTime() {
            return m_addedTime;
        }

        public Change getChange() {
            return m_change;
        }

        /**
         * Creates an entry for the given address and the change it made.
         *
         * @param ipaddress address of the host
         * @param change the change made from that address
         */
        public Host( final String ipaddress, final Change change ) {
            m_address = ipaddress;
            m_change = change;
            // m_banTime belongs to the enclosing filter instance (hence this class is not static); it is
            // scaled by 60 * 1000 to milliseconds, in long arithmetic throughout.
            m_releaseTime = System.currentTimeMillis() + m_banTime * 60L * 1000L;
        }

    }

    /**
     * The text of a change together with the number of additions and removals it consists of.
     * Two changes are equal when their texts are equal; the counters are not compared.
     */
    private static class Change {

        /** Text of the change; stays {@code null} until somebody sets it. */
        public String m_change;
        public int m_adds;
        public int m_removals;

        /**
         * Returns the text of the change.
         *
         * @return the change text, or an empty string if none has been set; never {@code null}
         */
        @Override
        public String toString() {
            return m_change == null ? "" : m_change;
        }

        /** Compares the change texts; a text that has not been set only equals another unset text. */
        @Override
        public boolean equals( final Object o ) {
            if( o instanceof Change ) {
                final String other = ( ( Change )o ).m_change;
                return m_change == null ? other == null : m_change.equals( other );
            }
            return false;
        }

        /** Consistent with {@link #equals(Object)}; an unset text hashes like a zero hash code. */
        @Override
        public int hashCode() {
            return ( m_change == null ? 0 : m_change.hashCode() ) + 17;
        }

    }

}