Module textcl.outliers_detection

This module contains functions for performing unsupervised outlier detection.
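
A minimal usage sketch (toy sentences and a lowered threshold, both invented for illustration; the default topic_name and text column names are assumed):

import pandas as pd
from textcl.outliers_detection import outlier_detection

# Toy data: one topic with three on-topic sentences and one off-topic one.
texts_df = pd.DataFrame({
    'topic_name': ['sports'] * 4,
    'text': [
        'the team won the championship game',
        'the striker scored two goals in the match',
        'fans celebrated the victory at the stadium',
        'the recipe calls for two cups of flour and an egg'
    ]
})

# Z_threshold is lowered because Z-scores stay small on only four samples.
normal_df, outlier_df = outlier_detection(texts_df, method='rpca', Z_threshold=1)
print(outlier_df['text'].tolist())  # sentences flagged as outliers, if any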

Expand source code
"""
This module contains functions for performing unsupervised outlier detection.
"""

from __future__ import division, print_function
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
import nltk
from nltk.corpus import stopwords
from scipy import stats


class _R_pca:
    """
    Robust PCA (principal component pursuit) implementation adapted from [this](https://github.com/dganguli/robust-pca) source.
    """
    def __init__(self, D, mu=None, lmbda=None):
        self.D = D
        self.S = np.zeros(self.D.shape)
        self.Y = np.zeros(self.D.shape)

        if mu is not None:
            self.mu = mu
        else:
            self.mu = np.prod(self.D.shape) / (4 * np.linalg.norm(self.D, ord=1))

        self.mu_inv = 1 / self.mu

        if lmbda is not None:
            self.lmbda = lmbda
        else:
            self.lmbda = 1 / np.sqrt(np.max(self.D.shape))

    @staticmethod
    def frobenius_norm(M):
        return np.linalg.norm(M, ord='fro')

    @staticmethod
    def shrink(M, tau):
        # element-wise soft-thresholding (shrinkage) operator
        return np.sign(M) * np.maximum((np.abs(M) - tau), np.zeros(M.shape))

    def svd_threshold(self, M, tau):
        # singular value thresholding: shrink the singular values of M by tau
        U, S, V = np.linalg.svd(M, full_matrices=False)
        return np.dot(U, np.dot(np.diag(self.shrink(S, tau)), V))

    def fit(self, tol=None, max_iter=1000, iter_print=100):
        # note: iter_print is currently unused
        n_iter = 0
        err = np.inf
        Sk = self.S
        Yk = self.Y
        Lk = np.zeros(self.D.shape)

        if tol is not None:
            _tol = tol
        else:
            _tol = 1E-7 * self.frobenius_norm(self.D)

        #this loop implements the principal component pursuit (PCP) algorithm
        #located in the table on page 29 of https://arxiv.org/pdf/0912.3599.pdf
        while (err > _tol) and n_iter < max_iter:
            Lk = self.svd_threshold(
                self.D - Sk + self.mu_inv * Yk, self.mu_inv)                            # step 3: singular value thresholding
            Sk = self.shrink(
                self.D - Lk + (self.mu_inv * Yk), self.mu_inv * self.lmbda)             # step 4: shrinkage of the sparse part
            Yk = Yk + self.mu * (self.D - Lk - Sk)                                      # step 5: dual variable update
            err = self.frobenius_norm(self.D - Lk - Sk)
            n_iter += 1

        self.L = Lk
        self.S = Sk
        return Lk, Sk


# function to use rpca
def rpca_implementation(bag_of_words):
    """
    Function to use RPCA for getting outlier_matrix. It uses [this](https://github.com/dganguli/robust-pca) RPCA implementation

    ---

    **Arguments**\n
    `bag_of_words` (array of shape (n_documents, n_words)): document-word matrix.

    ---

    **Returns**\n
    `outlier_matrix` (array of shape (n_documents, n_words)): outliers matrix.
    """

    rpca = _R_pca(np.array(bag_of_words))
    _, outlier_matrix = rpca.fit(max_iter=10000, iter_print=100)
    return outlier_matrix


def tonmf(A, k, alpha, beta):
    """
    Function to use TONMF for getting outlier_matrix. Minimizes the objective
    min_{W,H,Z} ||A - WH - Z||_F^2 + alpha ||Z||_{2,1} + beta ||H||_1
    where A is a sparse document-word matrix of size m x n
    and Z is the outlier matrix.

    ---

    **Arguments**\n
    `A` (array of shape (n_documents, n_words)): document-word matrix.\n
    `k` (int): rank.\n
    `alpha` (float): defines the weight for the outlier matrix Z.\n
    `beta` (float): approximation parameter.

    ---

    **Returns**\n
    `Z` (array of shape (n_documents, n_words)): outlier matrix.\n
    `W` (array of shape (n_documents, rank)): document-topic matrix.\n
    `H` (array of shape (rank, n_words)): topic-word matrix.\n
    `errChange` (array): errors history.
    """

    m, n = A.shape
    # numIterationsWH bounds the inner HALS updates of W,H; numIterations the outer loop
    numIterationsWH = 10
    numIterations = 10
    # random non-negative initialization of the factor matrices
    W = np.random.random((m, k))
    H = np.random.random((k, n))
    D = A - np.dot(W, H)
    currentIteration = 1
    Z = np.zeros((m, n))
    prevErr = np.linalg.norm(D, ord='fro') + beta * np.linalg.norm(H, ord=1)
    currentErr = prevErr - 1
    errChange = np.zeros((numIterations + 1, 1))
    # alternate: first fix W,H and solve for Z, then fix Z and update W,H via HALS
    while currentIteration < numIterations:
        # column-wise soft-thresholding of D = A - WH (proximal step for the l2,1 term)
        colnormdi = np.sqrt(np.sum(np.square(D), axis=0))
        colnormdi_factor = colnormdi - alpha
        colnormdi_factor[colnormdi_factor < 0] = 0
        colnormdi[colnormdi == 0] = np.finfo(float).eps  # avoid division by zero for empty columns
        Z = np.divide(D, colnormdi)
        Z = np.array(Z) * np.array(colnormdi_factor)
        D = A - Z
        W, H, _ = _hals(D, W, H, k, 1e-6, numIterationsWH, beta)
        D = A-np.dot(W, H)
        if currentIteration > 1:
            prevErr = currentErr
        errChange[currentIteration] = prevErr
        # objective value: Frobenius term + alpha * l2,1 norm of Z + beta * l1 norm of H
        currentErr = np.linalg.norm(D - Z, ord='fro') + alpha * np.sum(np.sqrt(np.sum(np.square(Z), axis=0))) + beta * np.linalg.norm(H, ord=1)
        currentIteration = currentIteration + 1
    errChange[currentIteration] = currentErr
    return Z, W, H, errChange


def _hals(A, Winit, Hinit, k, tolerance, numIterations, beta):
    """
    Function to minimize F(W, H) = ||A - WH||_F^2 + beta * ||H||_1 with respect to each column of W and H.
    Solves A ≈ WH, where
    A = m x n matrix
    W = m x k
    H = k x n
    k = low rank
    Implementation of Algorithm 2 from
    http://www.bsp.brain.riken.jp/publications/2009/Cichocki-Phan-IEICE_col.pdf

    ---

    **Arguments** \n
    `A` (array of shape (n_documents, n_words)): document-word matrix. \n
    `Winit` (array of shape (n_documents, rank)): initial document-topic matrix. \n
    `Hinit` (array of shape (rank, n_words)): initial topic-word matrix. \n
    `k` (int): rank.\n
    `tolerance` (float): stopping criterion.\n
    `numIterations` (int): max number of iterations. The algorithm stops when the tolerance is met or numIterations is reached.\n
    `beta` (float): approximation parameter.

    ---


    **Returns** \n
    `W` (array of shape (n_documents, rank)): document-topic matrix. \n
    `H` (array of shape (rank, n_words)): topic-word matrix. \n
    `errChange` (array): errors history.
    """
    W = Winit
    H = Hinit
    prevError = np.linalg.norm(A-np.dot(W, H), ord='fro')
    currError = prevError+1
    currentIteration = 1
    errChange = np.zeros(numIterations + 1)
    while abs(currError-prevError) > tolerance and currentIteration < numIterations:
        # update W
        AHt = np.dot(A, np.transpose(H))
        HHt = np.dot(H, np.transpose(H))
        # to avoid divide by zero error
        HHtDiag = np.array(np.diag(HHt))
        HHtDiag[HHtDiag == 0] = np.finfo(float).eps
        for x in range(k):
            # np.asarray(...).ravel() flattens the column whether A is an ndarray or np.matrix
            Wx = W[:, x] + (np.asarray(AHt[:, x]).ravel() - np.dot(W, HHt[:, x])) / HHtDiag[x]
            Wx[Wx < np.finfo(float).eps] = np.finfo(float).eps
            W[:, x] = Wx
        # update H
        WtA = np.dot(np.transpose(W), A)
        WtW = np.dot(np.transpose(W), W)
        # to avoid divide by zero error
        WtWDiag = np.array(np.diag(WtW))
        WtWDiag[WtWDiag == 0] = np.finfo(float).eps
        for x in range(k):
            Hx = H[x, :] + (np.asarray(WtA[x, :]).ravel() - np.dot(WtW[x, :], H)) / WtWDiag[x]
            Hx = Hx - beta / WtWDiag[x]  # the l1 penalty on H enters as a constant shift
            Hx[Hx < np.finfo(float).eps] = np.finfo(float).eps
            H[x, :] = Hx
        if currentIteration > 1:
            prevError = currError
        errChange[currentIteration] = prevError
        currError = np.linalg.norm(A-np.dot(W, H), ord='fro')
        currentIteration = currentIteration+1
    return W, H, errChange


def svd(bag_of_words):
    """
    Function to use SVD for getting outlier_matrix. It uses SVD from np.linalg and result representation\
    from the paper [Outlier Detection for Text Data: An Extended Version](https://arxiv.org/pdf/1701.01325.pdf) (page 8)

    ---

    **Arguments**\n
    `bag_of_words` (array of shape (n_documents, n_words)): document-word matrix.

    ---

    **Returns**\n
    `outlier_matrix` (array of shape (n_documents, n_words)): outliers matrix.
    """

    _, S, V = np.linalg.svd(bag_of_words, full_matrices=False)
    # outlier representation from the paper: Sigma^(1/2) V^T
    outlier_matrix = np.dot(np.sqrt(np.diag(S)), V)
    return outlier_matrix


def outlier_detection(split_search_results_df, method="tonmf", norm="l2", stop_words_lang="english", Z_threshold=3, label_col="topic_name", text_col="text", k=10, alpha=10, beta=0.05):
    """
    Function used to detect outliers in a list of texts

    ---

    **Arguments**\n
    `split_search_results_df` (DataFrame): DataFrame with search results split into sentences, containing\
    *topic_name*, *document_id*, *text*, *sentence*.\n
    `method` (string): name of the method to use for outlier detection. It should be 'rpca', 'tonmf' or 'svd'.\
    Default value = 'tonmf'. [This](https://github.com/dganguli/robust-pca) RPCA implementation is used. A Python\
    implementation of TONMF based on [Outlier Detection for Text Data: An Extended Version](https://arxiv.org/pdf/1701.01325.pdf)\
    is used.\n
    `norm` (string): the norm used to normalize the outlier matrix. It should be 'l1', 'l2' or 'max'. Default value = 'l2'.\
    sklearn.preprocessing.normalize is used.\n
    `stop_words_lang` (string): language for the stop words filtering. Default value = "english".\n
    `Z_threshold` (int): threshold on the absolute Z-score above which a text is treated as an outlier. Default value = 3.\n
    `label_col` (string): name of the label column in the data frame. Default value = "topic_name".\n
    `text_col` (string): name of the text column in the data frame. Default value = "text".\n
    `k` (int): rank for tonmf.\n
    `alpha` (float): defines the weight for the outlier matrix Z for tonmf.\n
    `beta` (float): approximation parameter for tonmf.

    ---

    **Returns**\n
    `normal_texts_df` (DataFrame): texts with |Z-score| <= Z_threshold; contains *topic_name*, *document_id*, *text*, *sentence*.\n
    `outlier_texts_df` (DataFrame): texts with |Z-score| > Z_threshold; contains *topic_name*, *document_id*, *text*, *sentence*.
    """

    nltk.download('stopwords')
    stop = stopwords.words(stop_words_lang)

    split_search_results_df['words'] = split_search_results_df[text_col].apply(
        lambda x: ' '.join([word for word in x.split() if word.lower() not in stop]))

    for topic in split_search_results_df[label_col].unique():
        # build the per-topic bag-of-words (document-word count) matrix
        bag_of_words = CountVectorizer().fit_transform(split_search_results_df[split_search_results_df[label_col] == topic]['words']).todense()

        if method == 'rpca':
            outlier_matrix = rpca_implementation(bag_of_words)
        elif method == 'tonmf':
            outlier_matrix, _, _, _ = tonmf(bag_of_words, k, alpha, beta)
        elif method == 'svd':
            outlier_matrix = svd(bag_of_words.T)
            outlier_matrix = outlier_matrix.T
        else:
            raise ValueError('method should be one of ["tonmf", "rpca", "svd"]')

        if norm in ("l1", "l2", "max"):
            # normalize rows of the outlier matrix; the returned row norms serve as outlier scores
            _, y_pred = preprocessing.normalize(outlier_matrix, axis=1, norm=norm, return_norm=True)
        else:
            raise ValueError('norm should be one of ["l1", "l2", "max"]')

        # Z-score method for threshold calculation: https://stackoverflow.com/questions/41290525/outliers-using-rpca
        Z = stats.zscore(y_pred)
        split_search_results_df.loc[split_search_results_df[label_col] == topic, 'Z_score'] = np.abs(Z)

    split_search_results_df['Z_score'] = split_search_results_df['Z_score'].fillna(0)
    normal_mask = split_search_results_df['Z_score'] <= Z_threshold
    return split_search_results_df[normal_mask], split_search_results_df[~normal_mask]

Functions

def outlier_detection(split_search_results_df, method='tonmf', norm='l2', stop_words_lang='english', Z_threshold=3, label_col='topic_name', text_col='text', k=10, alpha=10, beta=0.05)

Function used to detect outliers in a list of texts


Arguments

split_search_results_df (DataFrame): DataFrame with search results split into sentences, containing topic_name, document_id, text, sentence.

method (string): name of the method to use for outlier detection. It should be 'rpca', 'tonmf' or 'svd'. Default value = 'tonmf'. This (https://github.com/dganguli/robust-pca) RPCA implementation is used. A Python implementation of TONMF based on Outlier Detection for Text Data: An Extended Version (https://arxiv.org/pdf/1701.01325.pdf) is used.

norm (string): the norm used to normalize the outlier matrix. It should be 'l1', 'l2' or 'max'. Default value = 'l2'. sklearn.preprocessing.normalize is used.

stop_words_lang (string): language for the stop words filtering. Default value = "english".

Z_threshold (int): threshold on the absolute Z-score above which a text is treated as an outlier. Default value = 3.

label_col (string): name of the label column in the data frame. Default value = "topic_name".

text_col (string): name of the text column in the data frame. Default value = "text".

k (int): rank for tonmf.

alpha (float): defines the weight for the outlier matrix Z for tonmf.

beta (float): approximation parameter for tonmf.


Returns

normal_texts_df (DataFrame): texts with |Z-score| <= Z_threshold; contains topic_name, document_id, text, sentence.

outlier_texts_df (DataFrame): texts with |Z-score| > Z_threshold; contains topic_name, document_id, text, sentence.
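
The final thresholding step can be reproduced in isolation: the row norms returned by sklearn's normalize are converted to Z-scores, and rows whose absolute Z-score exceeds Z_threshold are flagged. A small sketch of just that step (the row norms below are invented for illustration):

import numpy as np
from scipy import stats

# Hypothetical row norms of an outlier matrix: the last row clearly dominates.
row_norms = np.array([0.9, 1.1, 1.0, 0.95, 6.0])

z = np.abs(stats.zscore(row_norms))
print(z)            # roughly [0.54, 0.44, 0.49, 0.52, 2.0]
print(z > 1.5)      # a lowered threshold flags only the last row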

def rpca_implementation(bag_of_words)

Function to use RPCA for getting outlier_matrix. It uses this (https://github.com/dganguli/robust-pca) RPCA implementation.


Arguments

bag_of_words (array of shape (n_documents, n_words)): document-word matrix.


Returns

outlier_matrix (array of shape (n_documents, n_words)): outliers matrix.
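
A sketch of calling it directly on a small count matrix (the matrix is invented for illustration; inside outlier_detection it comes from CountVectorizer):

import numpy as np
from textcl.outliers_detection import rpca_implementation

# Hypothetical 4x5 document-word count matrix; the last document uses a disjoint vocabulary.
bag_of_words = np.array([
    [2, 1, 0, 1, 0],
    [1, 2, 1, 0, 0],
    [2, 1, 1, 1, 0],
    [0, 0, 0, 0, 5],
], dtype=float)

outlier_matrix = rpca_implementation(bag_of_words)
# larger row norms indicate documents poorly explained by the low-rank part
print(np.linalg.norm(outlier_matrix, axis=1))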

def svd(bag_of_words)

Function to use SVD for getting outlier_matrix. It uses SVD from np.linalg and the result representation from the paper Outlier Detection for Text Data: An Extended Version (https://arxiv.org/pdf/1701.01325.pdf), page 8.


Arguments

bag_of_words (array of shape (n_documents, n_words)): document-word matrix.


Returns

outlier_matrix (array of shape (n_documents, n_words)): outliers matrix.
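
Note that outlier_detection passes the transposed matrix to this function and transposes the result back so that rows line up with documents. A direct sketch (random toy matrix, invented for illustration):

import numpy as np
from textcl.outliers_detection import svd

# Hypothetical 4x5 document-word matrix
bag_of_words = np.random.random((4, 5))

outlier_matrix = svd(bag_of_words.T).T  # as done inside outlier_detection
print(outlier_matrix.shape)             # (4, 4): one row of latent scores per document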

def tonmf(A, k, alpha, beta)

Function to use TONMF for getting outlier_matrix. Minimizes the objective min_{W,H,Z} ||A - WH - Z||_F^2 + alpha ||Z||_2,1 + beta ||H||_1, where A is a sparse document-word matrix of size m x n and Z is the outlier matrix.


Arguments

A (array of shape (n_documents, n_words)): document-word matrix.

k (int): rank.

alpha (float): defines the weight for the outlier matrix Z.

beta (float): approximation parameter.


Returns

Z (array of shape (n_documents, n_words)): outlier matrix.

W (array of shape (n_documents, rank)): document-topic matrix.

H (array of shape (rank, n_words)): topic-word matrix.

errChange (array): errors history.
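
A sketch of calling it directly (random non-negative toy matrix; the rank and weights below are illustrative only):

import numpy as np
from textcl.outliers_detection import tonmf

# Hypothetical 6x8 non-negative document-word matrix
A = np.random.random((6, 8))

Z, W, H, errChange = tonmf(A, k=2, alpha=1.0, beta=0.05)
print(Z.shape, W.shape, H.shape)  # (6, 8) (6, 2) (2, 8)
print(errChange.ravel())          # objective history over the outer iterations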
