nsBayesianFilter.h

comm-central/mailnews/extensions/bayesian-spam-filter/nsBayesianFilter.h

Enable keyboard shortcuts

Source code

Go to source file

Revision control

Copy as Markdown

Other Tools

HG Web

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */

/* This Source Code Form is subject to the terms of the Mozilla Public

 * License, v. 2.0. If a copy of the MPL was not distributed with this

 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef nsBayesianFilter_h__

#define nsBayesianFilter_h__

#include <stdio.h>

#include "nsCOMPtr.h"

#include "nsIMsgFilterPlugin.h"

#include "PLDHashTable.h"

#include "nsITimer.h"

#include "nsTArray.h"

#include "nsString.h"

#include "nsWeakReference.h"

#include "nsIObserver.h"

#include "nsHashPropertyBag.h"

#include "mozilla/intl/WordBreaker.h"

#include "mozilla/ArenaAllocator.h"

// Default minimum interval between writes: 15 * 60 * 1000 (presumably
// milliseconds, i.e. 15 minutes -- confirm against the users of this macro).
// The expansion is parenthesized so that it behaves as a single value in any
// surrounding expression (e.g. `x / DEFAULT_MIN_INTERVAL_BETWEEN_WRITES`).
#define DEFAULT_MIN_INTERVAL_BETWEEN_WRITES (15 * 60 * 1000)

// Forward declarations of types that the declarations below refer to.
// TokenEnumeration is defined later in this header; the other types are
// only named here and are not defined in this header.
struct Token;

class TokenEnumeration;

class TokenAnalyzer;

class nsIMsgWindow;

class nsIUTF8StringEnumerator;

struct BaseToken;

struct CorpusToken;

/**
 * Enumerates the Token objects held in a PLDHashTable safely and without
 * copying them (see bugzilla #174859).
 *
 * An enumeration may be used only until an Add() or Remove() is performed
 * on the underlying table.
 */
class TokenEnumeration {
 public:
  /** Begins an enumeration over the entries of |table|. */
  explicit TokenEnumeration(PLDHashTable* table);

  // NOTE(review): presumably reports whether nextToken() still has an entry
  // to hand out -- confirm against the implementation.
  bool hasMoreTokens();

  // NOTE(review): presumably yields the current entry and advances the
  // enumeration -- confirm against the implementation.
  BaseToken* nextToken();

 private:
  // Iterator over the hash table being enumerated.
  PLDHashTable::Iterator mIterator;
};

// A "trait" is an aspect of a message -- being junk, say, or being tagged
// as Personal -- that the statistical classifier should track. TraitPerToken
// is the per-token record of the information kept for one such trait.
//
// The traits of a token are chained together as a linked list, using
// indices rather than pointers (see mNextLink).
//
struct TraitPerToken {
  // Initializer; takes the trait id and the message count.
  TraitPerToken(uint32_t aId, uint32_t aCount);

  // Identifying number for a trait.
  uint32_t mId;

  // Count of messages with this token and trait.
  uint32_t mCount;

  // Index in mTraitStore for the next trait, or 0 for none.
  uint32_t mNextLink;
};

// An Analysis holds the statistical results for one particular message, one
// particular token, and one particular trait/antitrait pair. Those results
// are used by subsequent analysis to score the message.
//
// The analyses of a token are chained together as a linked list, using
// indices rather than pointers (see mNextLink).
//
struct AnalysisPerToken {
  // Initializer; takes the trait index, distance and probability.
  AnalysisPerToken(uint32_t aTraitIndex, double aDistance, double aProbability);

  // Index representing a protrait/antitrait pair. So if we are analyzing
  // 3 different traits, then the first trait is 0, the second 1, etc.
  uint32_t mTraitIndex;

  // Absolute value of mProbability - 0.5.
  double mDistance;

  // Relative indicator of match of trait to token.
  double mProbability;

  // Index in mAnalysisStore for the Analysis object for the next trait
  // index, or 0 for none.
  uint32_t mNextLink;
};

// Common base of Tokenizer and CorpusStore. It owns a PLDHashTable of
// tokens together with an arena allocator (mWordPool). The constructor is
// protected, so only derived classes can create instances.
class TokenHash {
 public:
  virtual ~TokenHash();

  /** Clears out the previous message tokens. */
  nsresult clearTokens();

  // NOTE(review): presumably the number of tokens currently in the table --
  // confirm against the implementation.
  uint32_t countTokens();

  // Returns an enumeration of the tokens; see TokenEnumeration for how long
  // such an enumeration may be used.
  TokenEnumeration getTokens();

  // Adds |word| and returns its token.
  // NOTE(review): behavior for an already-present word is not visible in
  // this header -- confirm against the implementation.
  BaseToken* add(const char* word);

 protected:
  // |entrySize| is presumably the size of one hash table entry (it has a
  // matching mEntrySize member) -- TODO confirm.
  explicit TokenHash(uint32_t entrySize);

  // Arena allocator; judging by its name, the pool that word strings are
  // copied into -- TODO confirm.
  mozilla::ArenaAllocator<16384, 2> mWordPool;

  uint32_t mEntrySize;

  // The hash table holding the tokens.
  PLDHashTable mTokenTable;

  // Copies |len| characters of |word|.
  // NOTE(review): presumably allocates the copy from mWordPool -- confirm.
  char* copyWord(const char* word, uint32_t len);

  // Looks up the token for |word|.
  BaseToken* get(const char* word);
};

// A TokenHash whose entries are exposed as Token objects, plus the routines
// and settings used to break message text, headers and attachments into
// tokens.
class Tokenizer : public TokenHash {
 public:
  Tokenizer();
  ~Tokenizer();

  // Looks up the Token for |word|. This declaration hides
  // TokenHash::get(const char*), which returns a BaseToken*.
  Token* get(const char* word);

  // The training set keeps an occurrence count on each word. This count
  // is supposed to count the # of messages it occurs in.
  //
  // NOTE(review): the original comment went on with "When add/remove is
  // called while tokenizing a message and NOT the training set," and was
  // left unfinished; what |count| means in that situation must be checked
  // against the implementation.
  Token* add(const char* word, uint32_t count = 1);

  // NOTE(review): presumably returns a copy of the current tokens; who owns
  // the returned memory is not visible in this header -- confirm.
  Token* copyTokens();

  // Tokenizes |text|.
  void tokenize(const char* text);

  /**
   * Creates specific tokens based on the mime headers for the message being
   * tokenized.
   */
  void tokenizeHeaders(nsTArray<nsCString>& aHeaderNames,
                       nsTArray<nsCString>& aHeaderValues);

  // Tokenizes the attachments described by |attachments|.
  void tokenizeAttachments(nsTArray<RefPtr<nsIPropertyBag2>>& attachments);

  // Delimiters for body tokenization.
  nsCString mBodyDelimiters;

  // Delimiters for header tokenization.
  nsCString mHeaderDelimiters;

  // Arrays of extra headers to tokenize / to not tokenize.
  nsTArray<nsCString> mEnabledHeaders;
  nsTArray<nsCString> mDisabledHeaders;

  // Delimiters used in tokenizing a particular header.
  // Parallel array to mEnabledHeaders.
  nsTArray<nsCString> mEnabledHeadersDelimiters;

  // Are there any preference-set tokenization customizations?
  bool mCustomHeaderTokenization;

  // Maximum length of a token.
  uint32_t mMaxLengthForToken;

  // Should we convert iframe to div during tokenization?
  bool mIframeToDiv;

 private:
  void tokenize_ascii_word(char* word);
  void tokenize_japanese_word(char* chunk);

  inline void addTokenForHeader(const char* aTokenPrefix, nsACString& aValue,
                                bool aTokenizeValue = false,
                                const char* aDelimiters = nullptr);

  nsresult stripHTML(const nsAString& inString, nsAString& outString);

  // Helper function dealing with escapes such as \n, \t, etc. in a CString.
  void UnescapeCString(nsCString& aCString);

  nsresult ScannerNext(const char16_t* text, int32_t length, int32_t pos,
                       bool isLastBuffer, int32_t* begin, int32_t* end,
                       bool* _retval);
};

/**
 * Storage for a collection of message tokens, and their counts, for a
 * corpus of classified messages.
 */
class CorpusStore : public TokenHash {
 public:
  CorpusStore();
  ~CorpusStore();

  /**
   * Retrieve the token structure for a particular string.
   *
   * @param word  the character representation of the token
   * @return      token structure containing counts, null if not found
   */
  CorpusToken* get(const char* word);

  /**
   * Add tokens to the storage, or increment counts if they already exist.
   *
   * @param aTokenizer  tokenizer for the list of tokens to remember
   * @param aTraitId    id for the trait whose counts will be remembered
   * @param aCount      number of new messages represented by the token list
   */
  void rememberTokens(Tokenizer& aTokenizer, uint32_t aTraitId,
                      uint32_t aCount);

  /**
   * Decrement counts for tokens in the storage, removing a token if all of
   * its counts are zero.
   *
   * @param aTokenizer  tokenizer for the list of tokens to forget
   * @param aTraitId    id for the trait whose counts will be removed
   * @param aCount      number of messages represented by the token list
   */
  void forgetTokens(Tokenizer& aTokenizer, uint32_t aTraitId, uint32_t aCount);

  /**
   * Write the corpus information to file storage.
   *
   * @param aMaximumTokenCount  prune tokens if the number of tokens exceeds
   *                            this value; == 0 for no pruning
   */
  void writeTrainingData(uint32_t aMaximumTokenCount);

  /** Read the corpus information from file storage. */
  void readTrainingData();

  /** Delete the local corpus storage file and data. */
  nsresult resetTrainingData();

  /**
   * Get the count of messages whose tokens are stored that are associated
   * with a trait.
   *
   * @param aTraitId  identifier for the trait
   * @return          number of messages for that trait
   */
  uint32_t getMessageCount(uint32_t aTraitId);

  /**
   * Set the count of messages whose tokens are stored that are associated
   * with a trait.
   *
   * @param aTraitId  identifier for the trait
   * @param aCount    number of messages for that trait
   */
  void setMessageCount(uint32_t aTraitId, uint32_t aCount);

  /**
   * Get the count of messages associated with a particular token and trait.
   *
   * @param token     the token string and associated counts
   * @param aTraitId  identifier for the trait
   */
  uint32_t getTraitCount(CorpusToken* token, uint32_t aTraitId);

  /**
   * Add (or remove) data from a particular file to the corpus data.
   *
   * @param aFile        the file with the data, in the format:
   *
   *                     Format of the trait file for version 1:
   *                     [0xFCA93601]  (the 01 is the version)
   *                     for each trait to write:
   *                     [id of trait to write] (0 means end of list)
   *                     [number of messages per trait]
   *                     for each token with non-zero count
   *                     [count]
   *                     [length of word]word
   *
   * @param aIsAdd       should the data be added, or removed? true if
   *                     adding, else removing.
   * @param aFromTraits  array of trait ids used in aFile. If aFile contains
   *                     trait ids that are not in this array, they are not
   *                     remapped, but assumed to be local trait ids.
   * @param aToTraits    array of trait ids, corresponding to elements of
   *                     aFromTraits, that represent the local trait ids to
   *                     be used in storing data from aFile into the local
   *                     corpus.
   */
  nsresult UpdateData(nsIFile* aFile, bool aIsAdd,
                      const nsTArray<uint32_t>& aFromTraits,
                      const nsTArray<uint32_t>& aToTraits);

  /**
   * Remove all counts (message and tokens) for a trait id.
   *
   * @param aTrait  trait id for the trait to remove
   */
  nsresult ClearTrait(uint32_t aTrait);

 protected:
  /** Return the local corpus storage file for junk traits. */
  nsresult getTrainingFile(nsIFile** aFile);

  /** Return the local corpus storage file for non-junk traits. */
  nsresult getTraitFile(nsIFile** aFile);

  /**
   * Read token strings from the data file.
   *
   * @param stream    file stream with token data
   * @param fileSize  file size
   * @param aTraitId  id for the trait whose counts will be read
   * @param aIsAdd    true to add the counts, false to remove them
   * @return          true if successful, false if error
   */
  bool readTokens(FILE* stream, int64_t fileSize, uint32_t aTraitId,
                  bool aIsAdd);

  /** Write token strings to the data file. */
  bool writeTokens(FILE* stream, bool shrink, uint32_t aTraitId);

  /** Remove counts for a token string. */
  void remove(const char* word, uint32_t aTraitId, uint32_t aCount);

  /** Add counts for a token string, adding the token string if new. */
  CorpusToken* add(const char* word, uint32_t aTraitId, uint32_t aCount);

  /**
   * Change counts in a trait in the traits array, adding the trait if
   * needed.
   */
  nsresult updateTrait(CorpusToken* token, uint32_t aTraitId,
                       int32_t aCountChange);

  // File used to store junk training data.
  nsCOMPtr<nsIFile> mTrainingFile;

  // File used to store non-junk training data.
  nsCOMPtr<nsIFile> mTraitFile;

  // Memory for linked-list of counts.
  nsTArray<TraitPerToken> mTraitStore;

  // Index in mTraitStore to first empty TraitPerToken.
  uint32_t mNextTraitIndex;

  // Count of messages per trait represented in the store.
  nsTArray<uint32_t> mMessageCounts;

  // Parallel array to mMessageCounts, with the corresponding trait ID.
  nsTArray<uint32_t> mMessageCountsId;
};

// The Bayesian filter component. It implements the nsIJunkMailPlugin,
// nsIMsgCorpus and nsIObserver interfaces, supports weak references, and
// owns the CorpusStore (mCorpus) holding the training data.
//
// Every base is listed with an explicit `public` access specifier: in a
// `class`, a base named without one is inherited privately, which hides the
// interface conversions from code outside the class.
class nsBayesianFilter : public nsIJunkMailPlugin,
                         public nsIMsgCorpus,
                         public nsIObserver,
                         public nsSupportsWeakReference {
 public:
  NS_DECL_ISUPPORTS
  NS_DECL_NSIMSGFILTERPLUGIN
  NS_DECL_NSIJUNKMAILPLUGIN
  NS_DECL_NSIMSGCORPUS
  NS_DECL_NSIOBSERVER

  nsBayesianFilter();

  // Second-stage initialization, reporting failure through its nsresult.
  // NOTE(review): what is initialized is not visible in this header.
  nsresult Init();

  // Tokenizes the message identified by |messageURI|.
  // NOTE(review): presumably the resulting tokens are handed to |analyzer|
  // -- confirm against the implementation.
  nsresult tokenizeMessage(const nsACString& messageURI,
                           nsIMsgWindow* aMsgWindow, TokenAnalyzer* analyzer);

  // Classifies the message |messageURI| from its tokens; this overload
  // takes only a junk classification listener.
  void classifyMessage(Tokenizer& tokens, const nsACString& messageURI,
                       nsIJunkMailClassificationListener* listener);

  // Classifies the message |messageURI| from its tokens with respect to the
  // given pro/anti trait arrays; this overload additionally takes trait
  // classification and trait detail listeners.
  void classifyMessage(Tokenizer& tokenizer, const nsACString& messageURI,
                       nsTArray<uint32_t>& aProTraits,
                       nsTArray<uint32_t>& aAntiTraits,
                       nsIJunkMailClassificationListener* listener,
                       nsIMsgTraitClassificationListener* aTraitListener,
                       nsIMsgTraitDetailListener* aDetailListener);

  // Takes the tokens of message |messageURI| together with its old and new
  // classifications.
  // NOTE(review): presumably updates the training corpus accordingly --
  // confirm against the implementation.
  void observeMessage(Tokenizer& tokens, const nsACString& messageURI,
                      nsTArray<uint32_t>& oldClassifications,
                      nsTArray<uint32_t>& newClassifications,
                      nsIJunkMailClassificationListener* listener,
                      nsIMsgTraitClassificationListener* aTraitListener);

 protected:
  // Protected destructor: instances cannot be destroyed directly by code
  // outside the class hierarchy.
  virtual ~nsBayesianFilter();

  // Static callback with the (timer, closure) signature, used with mTimer.
  // NOTE(review): presumably |aClosure| is the nsBayesianFilter -- confirm.
  static void TimerCallback(nsITimer* aTimer, void* aClosure);

  // The training corpus.
  CorpusStore mCorpus;

  double mJunkProbabilityThreshold;

  int32_t mMaximumTokenCount;

  bool mTrainingDataDirty;

  // In milliseconds, must be positive and not too close to 0.
  int32_t mMinFlushInterval;

  nsCOMPtr<nsITimer> mTimer;

  // Index in mAnalysisStore for first empty AnalysisPerToken.
  uint32_t mNextAnalysisIndex;

  // Memory for linked list of AnalysisPerToken objects.
  nsTArray<AnalysisPerToken> mAnalysisStore;

  /**
   * Determine the location in mAnalysisStore where the AnalysisPerToken
   * object for a particular token and trait is stored.
   */
  uint32_t getAnalysisIndex(Token& token, uint32_t aTraitIndex);

  /**
   * Set the value of the AnalysisPerToken object for a particular
   * token and trait.
   */
  nsresult setAnalysis(Token& token, uint32_t aTraitIndex, double aDistance,
                       double aProbability);
};

#endif  // nsBayesianFilter_h__