
“Fake news” is one of the defining features of contemporary democratic life.

In this blog post, we will develop and assess a fake news classifier using TensorFlow.

Note: working through this blog post in Google Colab is highly recommended.

Imports

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import numpy as np
import pandas as pd
import re
import string
from matplotlib import pyplot as plt
import plotly.express as px 
import plotly.io as pio
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, losses, utils
from tensorflow.keras.layers import TextVectorization
from sklearn.decomposition import PCA 
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.

Acquire Training Data

train_url = "https://github.com/PhilChodrow/PIC16b/blob/master/datasets/fake_news_train.csv?raw=true"
raw_df = pd.read_csv(train_url)
raw_df.head()
|   | Unnamed: 0 | title | text | fake |
|---|------------|-------|------|------|
| 0 | 17366 | Merkel: Strong result for Austria's FPO 'big c... | German Chancellor Angela Merkel said on Monday... | 0 |
| 1 | 5634 | Trump says Pence will lead voter fraud panel | WEST PALM BEACH, Fla.President Donald Trump sa... | 0 |
| 2 | 17487 | JUST IN: SUSPECTED LEAKER and “Close Confidant... | On December 5, 2017, Circa s Sara Carter warne... | 1 |
| 3 | 12217 | Thyssenkrupp has offered help to Argentina ove... | Germany s Thyssenkrupp, has offered assistance... | 0 |
| 4 | 5535 | Trump say appeals court decision on travel ban... | President Donald Trump on Thursday called the ... | 0 |

Make a Dataset

make_dataset Function

This function does two things:

  1. Removes stopwords (such as “the,” “and,” or “but”) from the article title and text.
  2. Constructs and returns a tf.data.Dataset with two inputs and one output. The inputs take the form (title, text), and the output consists only of the fake column. (A quick sanity check of the resulting structure follows the function definition below.)
def make_dataset(df):
  stop_words = stopwords.words('english')

  # remove stop words from titles and texts
  df["title"] = df["title"].apply(lambda title: ' '.join([word for word in title.split() if word not in (stop_words)])) 
  df['text'] = df['text'].apply(lambda text: ' '.join([word for word in text.split() if word not in (stop_words)])) 
  
  dataset = tf.data.Dataset.from_tensor_slices((
      # dictionary for input data
       {"title": df[["title"]], "text": df[["text"]]},
       # dictionary for output data
        { "fake": df["fake"]}   
        ))
  
  return dataset.batch(100) # batch the dataset
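
As a quick sanity check (a minimal sketch, assuming raw_df has been loaded as in the previous section), we can inspect the structure of a single batch produced by this function:

# Sanity check: peek at one batch produced by make_dataset.
# Uses a copy of raw_df so the original DataFrame is left untouched.
for inputs, outputs in make_dataset(raw_df.copy()).take(1):
    print(inputs["title"].shape)   # (100, 1): batch of titles
    print(inputs["text"].shape)    # (100, 1): batch of article texts
    print(outputs["fake"].shape)   # (100,): batch of labels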

Split Dataset for Validation

We split off 20% of the dataset to use for validation.

# Process data
df = make_dataset(raw_df)
df = df.shuffle(buffer_size = len(df))

# Split the dataset 
train_size = int(0.8*len(df)) 
val_size   = int(0.2*len(df)) 

train = df.take(train_size)
val = df.skip(train_size).take(val_size)

# Print Results 
print("Train Size: ", len(train))
print("Validation Size: ", len(val))
Train Size:  180
Validation Size:  45

Base Rate

labels_iterator= train.unbatch().map(lambda dict_title_text, label: label).as_numpy_iterator()

real = 0 
fake = 0 

for label in labels_iterator:
    if label["fake"]==0: #if label is not fake, increase the count of the real
        real +=1 
    else: #if label is fake, increase the count of the fake
        fake +=1

print("Real: ", real)
print("Fake: ", fake)
Real:  8521
Fake:  9479

Our base rate (the accuracy of a model that always makes the same guess) is about 53%, since 9479 of the 18000 training articles are fake.
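
For reference, here is a minimal computation of that base rate from the counts above:

# Base rate: accuracy of a model that always guesses the majority class ("fake").
base_rate = max(real, fake) / (real + fake)
print("Base rate: ", base_rate)   # 9479 / 18000 ≈ 0.527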

Text Vectorization

#preparing a text vectorization layer for tf model
size_vocabulary = 2000

def standardization(input_data):
    lowercase = tf.strings.lower(input_data)
    no_punctuation = tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation),'')
    return no_punctuation 

title_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500) 

title_vectorize_layer.adapt(train.map(lambda x, y: x["title"]))
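
To see what this layer does, here is a small illustrative sketch that passes one of the titles from the table above through the standardization function and the adapted vectorization layer:

# Illustration only: standardize and vectorize a single sample title.
sample_title = tf.constant([["Trump says Pence will lead voter fraud panel"]])
print(standardization(sample_title))        # lowercased, punctuation removed
print(title_vectorize_layer(sample_title))  # integer sequence of length 500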

Create Models

We use the Keras Functional API (rather than the Sequential API), since the third model will need two distinct named inputs (title and text).

First Model

Using titles to detect fake news

# Input Layer
titles_input = keras.Input(
    shape = (1,), 
    name = "title",
    dtype = "string"
)

# Hidden Layers
titles_features = title_vectorize_layer(titles_input) 
titles_features = layers.Embedding(size_vocabulary, output_dim = 2, name = "embedding")(titles_features) 
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.GlobalAveragePooling1D()(titles_features)
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.Dense(32)(titles_features)

# Output Layer
output = layers.Dense(2, name = "fake")(titles_features) 

model1 = keras.Model(
    inputs = titles_input,
    outputs = output
) 
keras.utils.plot_model(model1)

[Figure: model1 architecture diagram]

# compile model1
model1.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
# fit model1
history1 = model1.fit(train, 
                      validation_data = val,
                      epochs = 20, 
                      verbose = False)
/usr/local/lib/python3.10/dist-packages/keras/engine/functional.py:639: UserWarning: Input dict contained keys ['text'] which did not match any model input. They will be ignored by the model.
  inputs = self._flatten_to_reference_inputs(inputs)
# Visualize Accuracy
plt.plot(history1.history["accuracy"], label = "training")
plt.plot(history1.history["val_accuracy"], label = "validation")
plt.legend()
<matplotlib.legend.Legend at 0x7f8a200e80a0>

[Figure: model1 training and validation accuracy]

print("<Model 1 Final Accuracy: ", history1.history["val_accuracy"][-1])
<Model 1 Final Accuracy:  0.9857777953147888

Second Model

Using article text to detect fake news

# Text Vectorization Layer
text_vectorize_layer = TextVectorization(
    standardize=standardization,
    max_tokens=size_vocabulary, # only consider this many words
    output_mode='int',
    output_sequence_length=500) 

text_vectorize_layer.adapt(train.map(lambda x, y: x["text"]))
# Input Layer
text_input = keras.Input(
    shape = (1,), 
    name = "text",
    dtype = "string"
)

# Hidden Layer
text_features = text_vectorize_layer(text_input) 
text_features = layers.Embedding(size_vocabulary, output_dim = 2, name = "embedding2")(text_features) 
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.GlobalAveragePooling1D()(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.Dense(32, activation='relu')(text_features)

# Output layer
output = layers.Dense(2, name = "fake")(text_features) 

# Create model2
model2 = keras.Model(
    inputs = text_input,
    outputs = output
) 
# Visualize model2
keras.utils.plot_model(model2)

[Figure: model2 architecture diagram]

# Compile model2
model2.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
# Fit model2
history2 = model2.fit(train, 
                      validation_data = val,
                      epochs = 20, 
                      verbose = False)
/usr/local/lib/python3.10/dist-packages/keras/engine/functional.py:639: UserWarning: Input dict contained keys ['title'] which did not match any model input. They will be ignored by the model.
  inputs = self._flatten_to_reference_inputs(inputs)
# Visualize Accuracy
plt.plot(history2.history["accuracy"], label = "training")
plt.plot(history2.history["val_accuracy"], label = "validation")
plt.legend()
<matplotlib.legend.Legend at 0x7f89a74064a0>

[Figure: model2 training and validation accuracy]

print("<Model 2 Final Accuracy: ", history2.history["val_accuracy"][-1])
<Model 2 Final Accuracy:  0.9913333058357239

Third Model

Using article titles & text to detect fake news

The two pipelines reuse the exact same code as the first two models.

# First Pipeline
titles_features = title_vectorize_layer(titles_input) 
titles_features = layers.Embedding(size_vocabulary, output_dim = 2, name = "embedding_title")(titles_features) 
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.GlobalAveragePooling1D()(titles_features)
titles_features = layers.Dropout(0.2)(titles_features)
titles_features = layers.Dense(32)(titles_features)

# Second Pipeline
text_features = text_vectorize_layer(text_input) 
text_features = layers.Embedding(size_vocabulary, output_dim = 2, name = "embedding_text")(text_features) 
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.GlobalAveragePooling1D()(text_features)
text_features = layers.Dropout(0.2)(text_features)
text_features = layers.Dense(32, activation='relu')(text_features)

# Concatenate Two Pipelines
main = layers.concatenate([titles_features, text_features], axis = 1)

# Output Layer
output = layers.Dense(2, name = "fake")(main)
# Create model3
model3 = keras.Model(
    inputs = [titles_input, text_input],
    outputs = output
)
# Visualize model3
keras.utils.plot_model(model3)

[Figure: model3 architecture diagram]

# Compile model3
model3.compile(optimizer="adam",
              loss = losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=["accuracy"])
# Fit model3
history3 = model3.fit(train, 
                      validation_data = val,
                      epochs = 20, 
                      verbose = False)
plt.plot(history3.history["accuracy"], label = "training")
plt.plot(history3.history["val_accuracy"], label = "validation")
plt.legend()
<matplotlib.legend.Legend at 0x7f89a512add0>

[Figure: model3 training and validation accuracy]

print("<Model 3 Final Accuracy: ", history3.history["val_accuracy"][-1])
<Model 3 Final Accuracy:  0.998651385307312
To summarize the final validation accuracy of the three models:

  1. Model 1: 0.9858
  2. Model 2: 0.9913
  3. Model 3: 0.9987

Based on the three models' performance, it is best to use both the title and the text when detecting fake news.
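
For convenience, here is a small snippet that collects each model's final validation accuracy from its training history, which reproduces the numbers in the list above:

# Compare the final validation accuracy of the three models.
for name, history in [("Model 1", history1), ("Model 2", history2), ("Model 3", history3)]:
    print(name, "final validation accuracy:", history.history["val_accuracy"][-1])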

Model Evaluation

# Download & Process Test Data
test_url = "https://github.com/PhilChodrow/PIC16b/blob/master/datasets/fake_news_test.csv?raw=true" #test data
test_data = pd.read_csv(test_url)
test_data = make_dataset(test_data)

# Print Model Performance
print(model3.evaluate(test_data))
225/225 [==============================] - 2s 9ms/step - loss: 0.0191 - accuracy: 0.9947
[0.019058916717767715, 0.9946991205215454]

The final model achieved about 99.5% accuracy on the test data, meaning it correctly labels an article as fake or real about 99.5% of the time.
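
As an extra illustration (a small sketch, not part of the original evaluation), we can compare the model's predictions on the first batch of test data with the true labels:

# Sketch: predictions on one test batch vs. the true labels.
for inputs, outputs in test_data.take(1):
    logits = model3.predict(inputs)            # shape (batch_size, 2): raw logits
    predictions = np.argmax(logits, axis=1)    # 0 = real, 1 = fake
    print("Predicted:", predictions[:10])
    print("Actual:   ", outputs["fake"].numpy()[:10])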

Embedding Visualization

Let's look at the embedding learned by our model.

Since the embedding layers use output_dim = 2, the learned word embedding is 2-dimensional.

embedding_title Visualization

weights = model3.get_layer("embedding_title").get_weights()[0] # get the weights from the embedding layer
vocab = title_vectorize_layer.get_vocabulary()                 # get the vocabulary from our data prep for later

# Reduce the embedding weights to 2 dimensions with PCA
pca = PCA(n_components=2)
weights = pca.fit_transform(weights)

embedding_df = pd.DataFrame({
    'word': vocab,
    'x0':weights[:, 0],
    'x1':weights[:, 1]
})

# Plot Embedding Layer
fig = px.scatter(embedding_df,
                x = "x0",
                y = "x1",
                size=[2]*len(embedding_df),
                hover_name = "word")

fig.show()

[Figure: scatter plot of the title embedding]

embedding_text Visualization

weights = model3.get_layer("embedding_text").get_weights()[0] # get the weights from the embedding layer
vocab = title_vectorize_layer.get_vocabulary()                 # get the vocabulary from our data prep for later

# Reduce the embedding weights to 2 dimensions with PCA
pca = PCA(n_components=2)
weights = pca.fit_transform(weights)

embedding_df = pd.DataFrame({
    'word': vocab,
    'x0':weights[:, 0],
    'x1':weights[:, 1]
})

# Plot Embedding Layer
fig = px.scatter(embedding_df,
                x = "x0",
                y = "x1",
                size=[2]*len(embedding_df),
                hover_name = "word")

fig.show()

[Figure: scatter plot of the text embedding]

The embedding layer exposes subtle markers of fake news that are not easily identifiable by ordinary human reading. For example, “trumps” is highly associated with fake news, while “trump’s” is highly associated with real news. These minor differences are hard to notice by eye, but the embedding visualization shows that our model is taking advantage of them.
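
To inspect particular words (a small sketch using the embedding_df built just above, which at this point holds the text embedding), we can simply filter the DataFrame:

# Sketch: look up the 2D coordinates of specific words in the embedding.
# Any words in the vocabulary can be used here; these are the ones discussed above.
words_of_interest = ["trumps", "trump’s"]
print(embedding_df[embedding_df["word"].isin(words_of_interest)])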
