Module textcl.preprocessing
This module contains functions for general text preprocessing and filtering.
Expand source code
"""
This module contains functions for general text preprocessing and filtering.
"""
import pandas as pd
import re
from langdetect import detect_langs
import math
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
import warnings
warnings.filterwarnings("ignore", '.*future version*')
def perplexity_filtering(sentences_df, threshold=1000, sentence_col="sentence"):
    """
    Function used to filter sentences by perplexity

    Every sentence is scored with the pre-trained `openai-gpt` language model and is kept only
    when its perplexity (`exp` of the language-model loss) does not exceed `threshold`.
    Values that are not strings, or that contain no ASCII letter, digit or space, are never
    scored and are always dropped. The input DataFrame is not modified, and the model is
    loaded only when at least one sentence has to be scored.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains *sentence_col* column.\n
    `threshold` (int): Perplexity threshold used for filtering. Default value = 1000.\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---
    **Returns**\n
    `sentences_df` DataFrame filtered by perplexity. Index labels of the kept rows are preserved
    (the index is not reset).
    """
    sentences = list(sentences_df[sentence_col])
    # Only text with at least one ASCII alphanumeric/space character is sent to the model.
    scorable = [
        isinstance(text, str) and len(re.sub('[^0-9a-zA-Z ]', '', text)) > 0
        for text in sentences
    ]
    if not any(scorable):
        # Nothing can pass the filter, so skip loading the model altogether.
        return sentences_df.iloc[0:0].copy()

    # Load pre-trained model (weights)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    def score(sentence):
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        tensor_input = torch.tensor([token_ids])
        # Inference only: no gradients are needed to read the loss value.
        with torch.no_grad():
            loss = model(tensor_input, lm_labels=tensor_input)
        return math.exp(loss.item())

    # Positional selection keeps the caller's frame untouched (no helper column is added).
    kept_positions = [
        position
        for position, (text, can_score) in enumerate(zip(sentences, scorable))
        if can_score and score(text) <= threshold
    ]
    return sentences_df.iloc[kept_positions]
def language_filtering(sentences_df, threshold=0.99, language='en', sentence_col="sentence"):
    """
    Function used to filter sentences by language

    Each sentence is passed to `detect_langs`; the probability reported for `language` is its
    score (0 when the language is not among the candidates or detection fails). Only rows
    whose score is strictly greater than `threshold` are kept. The input DataFrame is not
    modified.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains *sentence_col* column. \n
    `threshold` (float): Language score threshold used for filtering. Default value = 0.99. \n
    `language` (str, optional): Language of sentences. Default value = 'en'. \n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---
    **Returns**\n
    `sentences_df` DataFrame filtered by language, with the index reset.
    """
    def language_score(sentence):
        try:
            candidates = detect_langs(sentence)
        except Exception:
            # Best effort: an undetectable sentence scores 0 and is filtered out.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            warnings.warn('Problem with detecting language for the sentence')
            return 0
        lang_score = 0
        for candidate in candidates:
            if candidate.lang == language:
                lang_score = candidate.prob
        return lang_score

    # Select rows by position so no helper column has to be written into the caller's frame.
    kept_positions = [
        position
        for position, sentence in enumerate(sentences_df[sentence_col])
        if language_score(sentence) > threshold
    ]
    return sentences_df.iloc[kept_positions].reset_index(drop=True)
def jaccard_sim_filtering(sentences_df, sentece_col="sentence", threshold=0.8, *, sentence_col=None):
    """
    Function used to filter sentences by Jaccard similarity

    Sentences are split on single spaces into word sets. A sentence is dropped when its
    Jaccard similarity (shared words / distinct words of both) with any *later* sentence
    is strictly greater than `threshold`, so the last of a group of near-duplicates survives.
    Rows are compared by position, which makes the result independent of the index labels,
    and the input DataFrame is not modified.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains the sentence column. \n
    `sentece_col` (String): Name of the sentence column in data frame (historical spelling, kept
    for backward compatibility). Default value = "sentence".\n
    `threshold` (float): Jaccard similarity score threshold used for filtering. Default value = 0.8.\n
    `sentence_col` (String, keyword-only): Correctly spelled alias of `sentece_col`; takes
    precedence when given. Default value = None.

    ---
    **Returns**\n
    `sentences_df` filtered by Jaccard similarity, with the index reset.
    """
    column = sentece_col if sentence_col is None else sentence_col
    # Non-string cells (e.g. NaN) become empty sets and are never similar to anything.
    token_sets = [
        set(text.split(' ')) if isinstance(text, str) else set()
        for text in sentences_df[column]
    ]
    total = len(token_sets)
    kept_positions = []
    for i in range(total):
        current = token_sets[i]
        is_duplicate = False
        for j in range(i + 1, total):
            later = token_sets[j]
            shared = len(current & later)
            union_size = len(current) + len(later) - shared
            # union_size is 0 only when both sets are empty; treat that as "not similar".
            if union_size and shared / union_size > threshold:
                is_duplicate = True
                break
        if not is_duplicate:
            kept_positions.append(i)
    return sentences_df.iloc[kept_positions].reset_index(drop=True)
def join_sentences_by_label(grouped_sentences_df, label_col="topic_name", sentence_col="sentence"):
    """
    Function used to join sentences into texts. Sentences are grouped by label

    All sentences sharing the same value in *label_col* are concatenated, in row order,
    with a single space between them.

    ---
    **Arguments**\n
    `grouped_sentences_df` (DataFrame): DataFrame with sentences which contains
    *label_col*, *sentence_col* columns. \n
    `label_col` (String): Name of the label column in data frame. Default value = "topic_name".\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".\n

    ---
    **Returns**\n
    `joined_df`: DataFrame with one row per label and the columns *label_col* and
    *sentence_col*, the latter holding the joined text.
    """
    separator = ' '
    sentences_per_label = grouped_sentences_df.groupby([label_col])[sentence_col]
    joined_texts = sentences_per_label.apply(separator.join)
    return joined_texts.reset_index()
def split_into_sentences(text_df, text_col="text", sentence_col="sentence"):
    """
    Function used to split texts into sentences

    Each text is stripped, every bullet character "•" (with any whitespace before it) is
    turned into a full stop, and the text is split at each space that follows one of
    `.`, `!`, `?` or `…`. The result has one row per sentence; all other columns are
    repeated for every sentence of a text. The input DataFrame is not modified.

    ---
    **Arguments**\n
    `text_df` (DataFrame): DataFrame which contains the *text_col* column.\n
    `text_col` (String): Name of the text column in data frame. Default value = "text".\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".\n

    ---
    **Returns**\n
    `split_text_df` with the same structure as text_df plus the *sentence_col* column, index reset.
    """
    stripped = text_df[text_col].str.strip()
    # `.str.replace(..., regex=True)` is required here: plain `Series.replace` only swaps
    # cells whose whole value equals the pattern text, so bullets were never converted.
    without_bullets = stripped.str.replace(r'\s*•', '.', regex=True)
    sentence_lists = without_bullets.str.split(r'(?<=[.!?…]) ')
    # Work on a copy so the caller's frame does not gain a list-valued column.
    split_text_df = text_df.copy()
    split_text_df[sentence_col] = sentence_lists
    return split_text_df.explode(sentence_col).reset_index(drop=True)
Functions
def jaccard_sim_filtering(sentences_df, sentece_col='sentence', threshold=0.8)
-
Function used to filter sentences by Jaccard similarity.
Arguments
sentences_df (DataFrame): DataFrame with sentences; it must contain the sentence column.
sentence_col (String): Name of the sentence column in the data frame. Default value = "sentence". (Note: the function signature spells this parameter `sentece_col`.)
threshold (float): Jaccard similarity score threshold used for filtering. Default value = 0.8.
Returns
sentences_df: DataFrame filtered by Jaccard similarity.
Expand source code
def jaccard_sim_filtering(sentences_df, sentece_col="sentence", threshold=0.8, *, sentence_col=None):
    """
    Function used to filter sentences by Jaccard similarity

    Sentences are split on single spaces into word sets. A sentence is dropped when its
    Jaccard similarity (shared words / distinct words of both) with any *later* sentence
    is strictly greater than `threshold`, so the last of a group of near-duplicates survives.
    Rows are compared by position, which makes the result independent of the index labels,
    and the input DataFrame is not modified.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains the sentence column. \n
    `sentece_col` (String): Name of the sentence column in data frame (historical spelling, kept
    for backward compatibility). Default value = "sentence".\n
    `threshold` (float): Jaccard similarity score threshold used for filtering. Default value = 0.8.\n
    `sentence_col` (String, keyword-only): Correctly spelled alias of `sentece_col`; takes
    precedence when given. Default value = None.

    ---
    **Returns**\n
    `sentences_df` filtered by Jaccard similarity, with the index reset.
    """
    column = sentece_col if sentence_col is None else sentence_col
    # Non-string cells (e.g. NaN) become empty sets and are never similar to anything.
    token_sets = [
        set(text.split(' ')) if isinstance(text, str) else set()
        for text in sentences_df[column]
    ]
    total = len(token_sets)
    kept_positions = []
    for i in range(total):
        current = token_sets[i]
        is_duplicate = False
        for j in range(i + 1, total):
            later = token_sets[j]
            shared = len(current & later)
            union_size = len(current) + len(later) - shared
            # union_size is 0 only when both sets are empty; treat that as "not similar".
            if union_size and shared / union_size > threshold:
                is_duplicate = True
                break
        if not is_duplicate:
            kept_positions.append(i)
    return sentences_df.iloc[kept_positions].reset_index(drop=True)
def join_sentences_by_label(grouped_sentences_df, label_col='topic_name', sentence_col='sentence')
-
Function used to join sentences into texts. Sentences are grouped by topics.
Arguments
grouped_sentences_df (DataFrame): DataFrame with sentences grouped by topics; it must contain the label_col and sentence_col columns.
label_col (String): Name of the label column in the data frame. Default value = "topic_name".
sentence_col (String): Name of the sentence column in the data frame. Default value = "sentence".
Returns
joined_df: DataFrame with columns label_column_name, joined_sentences.
Expand source code
def join_sentences_by_label(grouped_sentences_df, label_col="topic_name", sentence_col="sentence"):
    """
    Function used to join sentences into texts. Sentences are grouped by label

    All sentences sharing the same value in *label_col* are concatenated, in row order,
    with a single space between them.

    ---
    **Arguments**\n
    `grouped_sentences_df` (DataFrame): DataFrame with sentences which contains
    *label_col*, *sentence_col* columns. \n
    `label_col` (String): Name of the label column in data frame. Default value = "topic_name".\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".\n

    ---
    **Returns**\n
    `joined_df`: DataFrame with one row per label and the columns *label_col* and
    *sentence_col*, the latter holding the joined text.
    """
    separator = ' '
    sentences_per_label = grouped_sentences_df.groupby([label_col])[sentence_col]
    joined_texts = sentences_per_label.apply(separator.join)
    return joined_texts.reset_index()
def language_filtering(sentences_df, threshold=0.99, language='en', sentence_col='sentence')
-
Function used to filter sentences by language.
Arguments
sentences_df (DataFrame): DataFrame with sentences; it must contain the sentence column.
threshold (float): Language score threshold used for filtering. Default value = 0.99.
language (str, optional): Language of sentences. Default value = 'en'.
sentence_col (String): Name of the sentence column in the data frame. Default value = "sentence".
Returns
sentences_df: DataFrame filtered by language.
Expand source code
def language_filtering(sentences_df, threshold=0.99, language='en', sentence_col="sentence"):
    """
    Function used to filter sentences by language

    Each sentence is passed to `detect_langs`; the probability reported for `language` is its
    score (0 when the language is not among the candidates or detection fails). Only rows
    whose score is strictly greater than `threshold` are kept. The input DataFrame is not
    modified.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains *sentence_col* column. \n
    `threshold` (float): Language score threshold used for filtering. Default value = 0.99. \n
    `language` (str, optional): Language of sentences. Default value = 'en'. \n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---
    **Returns**\n
    `sentences_df` DataFrame filtered by language, with the index reset.
    """
    def language_score(sentence):
        try:
            candidates = detect_langs(sentence)
        except Exception:
            # Best effort: an undetectable sentence scores 0 and is filtered out.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            warnings.warn('Problem with detecting language for the sentence')
            return 0
        lang_score = 0
        for candidate in candidates:
            if candidate.lang == language:
                lang_score = candidate.prob
        return lang_score

    # Select rows by position so no helper column has to be written into the caller's frame.
    kept_positions = [
        position
        for position, sentence in enumerate(sentences_df[sentence_col])
        if language_score(sentence) > threshold
    ]
    return sentences_df.iloc[kept_positions].reset_index(drop=True)
def perplexity_filtering(sentences_df, threshold=1000, sentence_col='sentence')
-
Function used to filter sentences by perplexity.
Arguments
sentences_df (DataFrame): DataFrame with sentences; it must contain the sentence column.
threshold (int): Perplexity threshold used for filtering. Default value = 1000.
embeddings (String): Pretrained embeddings used for perplexity filtering. Default value = "news-forward". (Note: this parameter does not appear in the function signature shown above.)
sentence_col (String): Name of the sentence column in the data frame. Default value = "sentence".
Returns
sentences_df: DataFrame filtered by perplexity.
Expand source code
def perplexity_filtering(sentences_df, threshold=1000, sentence_col="sentence"):
    """
    Function used to filter sentences by perplexity

    Every sentence is scored with the pre-trained `openai-gpt` language model and is kept only
    when its perplexity (`exp` of the language-model loss) does not exceed `threshold`.
    Values that are not strings, or that contain no ASCII letter, digit or space, are never
    scored and are always dropped. The input DataFrame is not modified, and the model is
    loaded only when at least one sentence has to be scored.

    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences and which contains *sentence_col* column.\n
    `threshold` (int): Perplexity threshold used for filtering. Default value = 1000.\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".

    ---
    **Returns**\n
    `sentences_df` DataFrame filtered by perplexity. Index labels of the kept rows are preserved
    (the index is not reset).
    """
    sentences = list(sentences_df[sentence_col])
    # Only text with at least one ASCII alphanumeric/space character is sent to the model.
    scorable = [
        isinstance(text, str) and len(re.sub('[^0-9a-zA-Z ]', '', text)) > 0
        for text in sentences
    ]
    if not any(scorable):
        # Nothing can pass the filter, so skip loading the model altogether.
        return sentences_df.iloc[0:0].copy()

    # Load pre-trained model (weights)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    def score(sentence):
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
        tensor_input = torch.tensor([token_ids])
        # Inference only: no gradients are needed to read the loss value.
        with torch.no_grad():
            loss = model(tensor_input, lm_labels=tensor_input)
        return math.exp(loss.item())

    # Positional selection keeps the caller's frame untouched (no helper column is added).
    kept_positions = [
        position
        for position, (text, can_score) in enumerate(zip(sentences, scorable))
        if can_score and score(text) <= threshold
    ]
    return sentences_df.iloc[kept_positions]
def split_into_sentences(text_df, text_col='text', sentence_col='sentence')
-
Function used to split texts into sentences.
Arguments
text_df (DataFrame): DataFrame with search results which contains topic_name, document_id, text.
text_col (String): Name of the text column in the data frame. Default value = "text".
sentence_col (String): Name of the sentence column in the data frame. Default value = "sentence".
Returns
split_text_df: DataFrame with the same structure as text_df plus the sentence column.
Expand source code
def split_into_sentences(text_df, text_col="text", sentence_col="sentence"):
    """
    Function used to split texts into sentences

    Each text is stripped, every bullet character "•" (with any whitespace before it) is
    turned into a full stop, and the text is split at each space that follows one of
    `.`, `!`, `?` or `…`. The result has one row per sentence; all other columns are
    repeated for every sentence of a text. The input DataFrame is not modified.

    ---
    **Arguments**\n
    `text_df` (DataFrame): DataFrame which contains the *text_col* column.\n
    `text_col` (String): Name of the text column in data frame. Default value = "text".\n
    `sentence_col` (String): Name of the sentence column in data frame. Default value = "sentence".\n

    ---
    **Returns**\n
    `split_text_df` with the same structure as text_df plus the *sentence_col* column, index reset.
    """
    stripped = text_df[text_col].str.strip()
    # `.str.replace(..., regex=True)` is required here: plain `Series.replace` only swaps
    # cells whose whole value equals the pattern text, so bullets were never converted.
    without_bullets = stripped.str.replace(r'\s*•', '.', regex=True)
    sentence_lists = without_bullets.str.split(r'(?<=[.!?…]) ')
    # Work on a copy so the caller's frame does not gain a list-valued column.
    split_text_df = text_df.copy()
    split_text_df[sentence_col] = sentence_lists
    return split_text_df.explode(sentence_col).reset_index(drop=True)