# Reload modules every time before executing the Python code typed
%load_ext autoreload
%autoreload 2

# Import from parent directory
import sys; sys.path.insert(0, '..')

# Configure data storage
from yaml import safe_load
import classes.io
# Read DATA_DIRECTORY from the YAML config; the `with` block closes the file
# handle as soon as the config is parsed instead of leaving it open.
with open('../config.yaml', 'r') as config_file:
    io = classes.io.Io(safe_load(config_file)['DATA_DIRECTORY'])

# Additional imports
import classes.reduction

import matplotlib.pyplot as plt
import seaborn as sns

from classes.geometry import Geometry
from polylidar.polylidarutil import plot_polygons

from gensim.utils import simple_preprocess
from wordcloud import WordCloud, STOPWORDS
from collections import Counter


# Default matplotlib figure size as (width, height); the commented line is a
# larger alternative.
#plt.rcParams["figure.figsize"] = (12,8)
plt.rcParams["figure.figsize"] = (6,4)


# Load the text pair of the selected dataset; the two sides are accessed
# through texts.get_a() and texts.get_b().
dataset_id = 'amazon-movie-reviews-10000'
texts = io.load_data_pair(dataset_id, io.DATATYPE_TEXT)

Loaded /home/eml4u/EML4U/data/explanation/data/amazon-movie-reviews-10000/text.pickle


# Load the doc2vec embedding pair ('dim50-epochs50') of the same dataset.
# NOTE(review): datatype_id is not referenced by any visible cell; the call
# below passes the descriptor parts separately — confirm whether it is needed.
datatype_id = 'doc2vec.dim50-epochs50'
embeddings = io.load_data_pair(dataset_id, io.DATATYPE_EMBEDDINGS, io.DESCRIPTOR_DOC_TO_VEC, 'dim50-epochs50')

Loaded /home/eml4u/EML4U/data/explanation/data/amazon-movie-reviews-10000/doc2vec.dim50-epochs50.embeddings.pickle


# Reduce both embedding sets with PCA. Only the embedding vectors (the values
# of get_a() / get_b()) are passed, without their keys.
# NOTE(review): rows of pca_a / pca_b presumably keep the order of those
# values — confirm in Reduction.pca.
dimension_reduction = classes.reduction.Reduction()
#embeddings_a, embeddings_b = dimension_reduction.pca_dict(embeddings.get_a(), embeddings.get_b())
pca_a, pca_b = dimension_reduction.pca(list(embeddings.get_a().values()), list(embeddings.get_b().values()))

PCA seconds: 0.13505211472511292


def plot(data_a, data_b):
    """Draw both point sets as semi-transparent seaborn scatter plots.

    Column 0 of each array is used as x and column 1 as y. ``data_a`` is
    drawn with the single-colour palette ``#3465A4`` and the constant hue
    value 1, ``data_b`` with ``#F57900`` and the constant hue value 5.
    """
    # One (points, palette, hue) triple per data set, drawn in this order.
    # NOTE(review): the hue constants 1 and 5 presumably serve as legend
    # labels — confirm.
    layers = (
        (data_a, ['#3465A4'], 1),
        (data_b, ['#F57900'], 5),
    )
    for plot_data, palette, hue in layers:
        sns.scatterplot(data=plot_data, x=plot_data[:, 0], y=plot_data[:, 1], palette=palette, hue=hue, alpha=0.3)


# Scatter plot of the two PCA-reduced embedding sets; the commented line is
# the variant for the dict-based pca_dict() result.
#plot(list(embeddings_a.values()), list(embeddings_b.values()))
plot(pca_a, pca_b)


def plot_points_polygons(points, polygons):
    """Show a scatter plot of ``points`` with ``polygons`` drawn on top.

    points: array indexed as ``points[:, 0]`` (x) and ``points[:, 1]`` (y).
    polygons: handed unchanged to polylidar's ``plot_polygons`` together
        with ``points`` and the axes.
    """
    _, axes = plt.subplots(nrows=1, ncols=1)
    xs, ys = points[:, 0], points[:, 1]
    axes.scatter(xs, ys, c='k')
    plot_polygons(polygons, points, axes)
    # Same scaling on both axes before the figure is displayed.
    plt.axis('equal')
    plt.show()


# Extract polygons from each PCA point set and plot them over the points.
geometry = Geometry()
plot_points_polygons(pca_a, geometry.extract_polygons(pca_a))
plot_points_polygons(pca_b, geometry.extract_polygons(pca_b))

/home/eml4u/.local/lib/python3.8/site-packages/descartes/patch.py:62: ShapelyDeprecationWarning: The array interface is deprecated and will no longer work in Shapely 2.0. Convert the '.coords' to a numpy array instead.
  vertices = concatenate([


# Build one polygon per data set from the first entry of the extracted polygon
# indexes, then subtract the polygons from each other in both directions.
# NOTE(review): only index [0] is used, so any further extracted polygons are
# ignored and an empty result would raise IndexError — confirm this is intended.
polygon_indexes_a = geometry.extract_polygon_indexes(pca_a)
polygon_indexes_b = geometry.extract_polygon_indexes(pca_b)

polygon_a = geometry.create_polygon(pca_a, polygon_indexes_a[0])
polygon_b = geometry.create_polygon(pca_b, polygon_indexes_b[0])
# Presumably shapely geometries, where `-` is the geometric difference — TODO confirm.
polygon_a_not_b = polygon_a - polygon_b
polygon_b_not_a = polygon_b - polygon_a


# Bare expressions: each one is shown as cell output when run in the notebook.
polygon_a


polygon_b


polygon_a_not_b


polygon_b_not_a


# For each data set, pass the PCA points, the embedding keys and the polygon
# part not shared with the other set. The returned indexes are used afterwards
# to look up texts via texts.get_a() / texts.get_b().
# NOTE(review): presumably the keys line up one-to-one with the point rows — confirm.
indexes_only_a = geometry.get_indexes_of_points_in_polygon(pca_a, list(embeddings.get_a().keys()), polygon_a_not_b)
indexes_only_b = geometry.get_indexes_of_points_in_polygon(pca_b, list(embeddings.get_b().keys()), polygon_b_not_a)


# Tokenize the texts of the selected points (tokens of 2 to 15 characters).
tokens_a = []
for index in indexes_only_a:
    tokens_a.extend(simple_preprocess(texts.get_a()[index], deacc=False, min_len=2, max_len=15))
tokens_b = []
for index in indexes_only_b:
    tokens_b.extend(simple_preprocess(texts.get_b()[index], deacc=False, min_len=2, max_len=15))

# Drop the wordcloud stop words plus 'br' (presumably left over from HTML line breaks).
stopwords = set(STOPWORDS) | {'br'}
tokens_a = [token for token in tokens_a if token not in stopwords]
tokens_b = [token for token in tokens_b if token not in stopwords]

# Token frequencies per data set.
counts_a, counts_b = Counter(tokens_a), Counter(tokens_b)


# Font used for the word clouds; locate it with: fc-list | grep 'NotoSans-Bold'
font_path = '/usr/share/fonts/truetype/noto/NotoSans-Bold.ttf'
# One 1200x800 word cloud per data set, built from its token frequencies (A first, then B).
wordcloud_a_counts, wordcloud_b_counts = [
    WordCloud(background_color="white", font_path=font_path, colormap='Dark2', width=1200, height=800).generate_from_frequencies(frequencies)
    for frequencies in (counts_a, counts_b)
]


# Show the word cloud of data set A with the axes switched off.
plt.imshow(wordcloud_a_counts)
plt.axis("off")
# Uncomment to export the figure:
#plt.savefig('2021-10-27-wordcloud_a_counts.png', dpi=300, bbox_inches='tight')

(-0.5, 1199.5, 799.5, -0.5)


# Show the word cloud of data set B with the axes switched off.
plt.imshow(wordcloud_b_counts)
plt.axis("off")
# Uncomment to export the figure:
#plt.savefig('2021-10-27-wordcloud_b_counts.png', dpi=300, bbox_inches='tight')

(-0.5, 1199.5, 799.5, -0.5)


# Keep only tokens that do not occur at all in the other data set.
tokens_a_set, tokens_b_set = set(tokens_a), set(tokens_b)
tokens_a2 = [token for token in tokens_a if token not in tokens_b_set]
tokens_b2 = [token for token in tokens_b if token not in tokens_a_set]

# Recount on the exclusive tokens; this rebinds counts_a / counts_b.
counts_a, counts_b = Counter(tokens_a2), Counter(tokens_b2)

# Rebuild both word clouds from the new frequencies (A first, then B).
wordcloud_a_counts, wordcloud_b_counts = [
    WordCloud(background_color="white", font_path=font_path, colormap='Dark2', width=1200, height=800).generate_from_frequencies(frequencies)
    for frequencies in (counts_a, counts_b)
]


# Show the word cloud of tokens found only in data set A, axes switched off.
plt.imshow(wordcloud_a_counts)
plt.axis("off")
# Uncomment to export the figure:
#plt.savefig('2021-10-27-wordcloud_a_counts2.png', dpi=300, bbox_inches='tight')

(-0.5, 1199.5, 799.5, -0.5)


# Show the word cloud of tokens found only in data set B, axes switched off.
plt.imshow(wordcloud_b_counts)
plt.axis("off")
# Uncomment to export the figure:
#plt.savefig('2021-10-27-wordcloud_b_counts2.png', dpi=300, bbox_inches='tight')

(-0.5, 1199.5, 799.5, -0.5)

# Explaining Drift: Embeddings and Geometry

## Source data

## Embeddings

## Dimension reduction

## Plot

## Extract polygons

## Unique polygon parts

## Get points inside unique polygons

## Get words (tokens)

## Wordcloud

## Wordcloud with unique words