In [1]:
# Get datasets from: https://drive.google.com/open?id=16w02wuMOqoLm6-YlM-rhAC1uZthO-2A3
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import numpy as np

# authenticate user credentials
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# download datasets from GDrive
# uncomment the dataset needed

# Small_10_100 Dataset
downloaded = drive.CreateFile({'id': '19wcJs9TdIwhT85TGdsaw1GV1ypbLnOpK'})
downloaded.GetContentFile('small_10_100.zip')
# Large_10_1000 Dataset
#downloaded = drive.CreateFile({'id': '1pghUuYyaYsuwDCA7Ow1Y601kGl6qPboL'})
#downloaded.GetContentFile('large_10_1000.zip')
# Large_95_100 Dataset
#downloaded = drive.CreateFile({'id': '1f-o1ygRlTZtvhbC9naP721khFJicbp-n'})
#downloaded.GetContentFile('large_95_100.zip')
# Full Dataset
#downloaded = drive.CreateFile({'id': '1SSwOHbTTPQJcpjyMtl9s2vdFGNonCKV7'})
#downloaded.GetContentFile('full.zip')

# unzip the datasets
# uncomment the dataset needed
!unzip -o small_10_100.zip
#!unzip -o large_10_1000.zip
#!unzip -o large_95_100.zip
#!unzip -o full.zip




Archive:  small_10_100.zip
   creating: small_10_100/
  inflating: small_10_100/1-27       
  inflating: small_10_100/1-26       
  inflating: small_10_100/1-25       
  inflating: small_10_100/1-24       
  inflating: small_10_100/1-23       
  inflating: small_10_100/1-22       
  inflating: small_10_100/1-21       
  inflating: small_10_100/1-20       
  inflating: small_10_100/1-19       
  inflating: small_10_100/1-18       
  inflating: small_10_100/1-17       
  inflating: small_10_100/1-16       
  inflating: small_10_100/1-15       
  inflating: small_10_100/1-14       
  inflating: small_10_100/1-13       
  inflating: small_10_100/1-12       
  inflating: small_10_100/1-11       
  inflating: small_10_100/1-10       
  inflating: small_10_100/1-9        
  inflating: small_10_100/1-8        
  inflating: small_10_100/1-7        
  inflating: small_10_100/1-6        
  inflating: small_10_100/1-5        
  inflating: small_10_100/1-4        
  inflating: small_10_100/1-3     

In [2]:
%tensorflow_version 2.x
%load_ext tensorboard
import numpy as np
from sklearn.utils import shuffle

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dense, Flatten, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.regularizers import l2

# Additional Packages
from tensorflow.keras.layers import BatchNormalization, Activation, AveragePooling1D, ELU
from tensorflow.keras.optimizers import Adam, Adamax, RMSprop
from keras.initializers import glorot_uniform
from pathlib import Path
from keras.callbacks import CSVLogger
import shutil
import os


import tensorflow as tf
#tf.logging.set_verbosity(tf.logging.ERROR)


Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [3]:
# Run this cell when switching between the Large_95_100 and the Full dataset.
# Both are read from the same folder (/content/undefended), so the old folder
# has to be removed first or traces from the two datasets will be mixed.
#shutil.rmtree('/content/undefended')

In [9]:
# Global Definitions
# Set the data_path based on the Dataset (keep exactly one assignment active):
# /content/small_10_100 - small_10_100
# /content/large_10_1000 - large_10_1000
# /content/undefended - large_95_100, Full
#data_path="/content/undefended/"
data_path="/content/small_10_100/"  # trace data path

# Change the num_sites based on the dataset
# 10 - small_10_100, large_10_1000
# 95 - large_95_100, full
num_sites=10              # number of sites (max 95)
num_instances=100         # number of instances per site (max 100)
file_ext=""                # trace file extension

# Changeable Hyperparameters
# Set the input units (2000, 3000, 5000, etc)
max_length = 8000           # maximum number of packet directions to use

In [5]:
# Information about the dataset: number of classes (sites) found in
# `data_path` and the average number of trace files per class.
from collections import defaultdict

count_dict = defaultdict(int)
for filename in os.listdir(data_path):
    # Trace files are named "<site>-<instance>". Skip every other directory
    # entry (for example a ".ipynb_checkpoints" folder) instead of crashing
    # while unpacking the split result.
    site, dash, instance = filename.partition("-")
    if not (site and dash and instance):
        continue
    count_dict[site] += 1
samples = list(count_dict.values())

print("# of Classes: "+str(len(samples)))
# np.average of an empty list is NaN (with a warning); report 0.0 instead.
print("Avg # of Samples per Class: "+str(np.average(samples) if samples else 0.0))

# of Classes: 10
Avg # of Samples per Class: 100.0


In [10]:
from tensorflow.python.ops.gen_linalg_ops import BatchCholesky

def get_data():
    """
    Load the packet-direction traces into one data matrix.

    For every site in ``range(num_sites)`` and every instance in
    ``range(num_instances)`` the file ``<site>-<instance><file_ext>`` inside
    ``data_path`` is read (all four are module-level settings). Each non-blank
    line is split on tabs and the sign of its second field gives the packet
    direction: ``1`` if the value is greater than zero, ``-1`` otherwise.
    Every trace is truncated or zero-padded to exactly ``max_length``
    directions.

    :return: a numpy ndarray of dimension (m x (n+1)), where `m` is the number
        of traces (``num_sites * num_instances``) and `n` is ``max_length``.
        The first `n` columns hold the packet directions; the last column
        holds the class label, which is the site number.
    :raises OSError: if an expected trace file cannot be opened.
    """
    # read data from files
    print("loading data...")
    data = []
    for site in range(num_sites):
        for instance in range(num_instances):
            file_name = str(site) + "-" + str(instance) + file_ext
            # os.path.join works whether or not data_path ends with a separator
            with open(os.path.join(data_path, file_name), "r") as file_pt:
                directions = []
                for line in file_pt:
                    # Anything past max_length would be cut off anyway.
                    if len(directions) >= max_length:
                        break
                    line = line.strip()
                    if not line:
                        # tolerate blank lines (e.g. a trailing newline)
                        continue
                    fields = line.split('\t')
                    directions.append(1 if float(fields[1]) > 0 else -1)
            # zero-pad short traces so every row has the same length
            directions.extend([0] * (max_length - len(directions)))
            data.append(directions + [site])
    print("done")
    return np.array(data)

def split_data(X, Y, fraction=0.80, balance_dist=False):
    """
    Shuffle the samples and split them into a training and a test set.

    :param X: a numpy ndarray of dimension (m x n) containing data samples
    :param Y: a numpy ndarray of length m containing the labels for X
    :param fraction: a value between 0 and 1; the fraction of the data that
        goes into the training set, the rest becomes the test set.
    :param balance_dist: boolean value. If true, the split is done separately
        for every class, so each class contributes ``round(count * fraction)``
        samples to the training set and the remainder to the test set.
    :return: X_train, Y_train, X_test, Y_test

    With ``balance_dist=True`` the returned arrays are float64 (as in the
    original implementation); otherwise they keep the dtype of the inputs.
    """
    X, Y = shuffle(X, Y)
    m, n = X.shape
    if balance_dist:
        # Collect the row indices per class and size the outputs from those.
        # Preallocating from round(m * fraction) is wrong whenever the
        # per-class rounding does not add up to the global rounding: it leaves
        # all-zero rows or fails with a broadcast error.
        train_parts = []
        test_parts = []
        for label in np.unique(Y):
            indices = np.where(Y == label)[0]
            split = int(round(len(indices)*fraction))
            train_parts.append(indices[:split])
            test_parts.append(indices[split:])
        no_rows = np.empty(0, dtype=int)
        train_idx = np.concatenate(train_parts) if train_parts else no_rows
        test_idx = np.concatenate(test_parts) if test_parts else no_rows

        X_train = X[train_idx].astype(np.float64)
        Y_train = Y[train_idx].astype(np.float64)
        X_test = X[test_idx].astype(np.float64)
        Y_test = Y[test_idx].astype(np.float64)

        # The rows are grouped by class at this point; mix them again.
        X_train, Y_train = shuffle(X_train, Y_train)
        X_test, Y_test = shuffle(X_test, Y_test)
        return X_train, Y_train, X_test, Y_test
    split_index = int(round(m*fraction))
    return X[:split_index, :], Y[:split_index], \
        X[split_index:, :], Y[split_index:]

class AWF:
    """
    This class implements the AWF Model from "Automated Website Fingerprinting
    through Deep Learning" by V. Rimmer et al.
    https://github.com/DistriNet/DLWF

    The compiled Keras model is available as ``self.model``.
    """
    def __init__(self, num_features, num_classes):
        """
        :param num_features: number of features (columns) in the data (X)
        :param num_classes: number of unique labels in the data (number of
            websites)
        """
        kernel_size = 5
        filters = 32
        pool_size = 4
        dropout = 0.1
        lr = 0.001

        model = Sequential()
        # dropout is applied directly to the input sequence
        model.add(Dropout(input_shape=(num_features, 1), rate=dropout))
        # two identical convolution + max-pooling stages
        for _ in range(2):
            model.add(Conv1D(filters=filters, kernel_size=kernel_size,
                             padding='valid', activation='relu', strides=1))
            model.add(MaxPooling1D(pool_size=pool_size, padding='valid'))
        model.add(Flatten())
        model.add(Dense(units=num_classes, activation='softmax'))

        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        optimizer = RMSprop(learning_rate=lr)
        model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
        self.model = model
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (which would add a stray "None" line).
        self.model.summary()

    def fit(self, X_train, Y_train, batch_size, epochs, verbose):
        """
        :param X_train: a numpy ndarray containing training data
        :param Y_train: a numpy ndarray containing the one-hot encoded labels
            for X_train
        :param batch_size: batch size to use for training
        :param epochs: number of epochs for training
        :param verbose: Console print options for training progress.
            0 - silent mode,
            1 - progress bar,
            2 - one line per epoch
        :return: None

        This method trains the model on the given data, holding out the last
        20% of it for validation. Training is logged to TensorBoard
        (./graph/AWF) and to the CSV file logAWF.csv.

        Tensorboard could be launched by navigating to the directory
        containing this file in terminal and running the following command.
            > tensorboard --logdir graph
        """
        tboard_cb = TensorBoard(log_dir='./graph/AWF', histogram_freq=0,
                                write_graph=True, write_images=True)
        csv_logger = CSVLogger('logAWF.csv')
        self.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=0.20,
                       callbacks=[tboard_cb, csv_logger])

class DF:
    """
    This class implements the DF model from "Deep Fingerprinting: Undermining
    Website Fingerprinting Defenses with Deep Learning" by P. Sirinam, M. Imani,
    M. Juarez and M. Wright
    https://github.com/deep-fingerprinting/df

    The compiled Keras model is available as ``self.model``.
    """
    def __init__(self, num_features, num_classes):
        """
        :param num_features: number of features (columns) in the data (X)
        :param num_classes: number of unique labels in the data (number of
            websites)
        """
        # settings shared by all four convolutional blocks
        block_filters = (32, 64, 128, 256)
        kernel_size = 8
        conv_stride_size = 1
        pool_size = 8
        pool_stride_size = 4

        model = Sequential()
        for block, filters in enumerate(block_filters, start=1):
            # every block: 2 x (conv -> batch norm -> activation), pool, dropout
            for conv in (1, 2):
                conv_kwargs = dict(filters=filters, kernel_size=kernel_size,
                                   strides=conv_stride_size, padding='same',
                                   name='block%d_conv%d' % (block, conv))
                if block == 1 and conv == 1:
                    # only the very first layer declares the input shape
                    conv_kwargs['input_shape'] = (num_features, 1)
                model.add(Conv1D(**conv_kwargs))
                model.add(BatchNormalization())
                if block == 1:
                    # the first block uses ELU, the remaining blocks use ReLU
                    model.add(ELU(alpha=1.0, name='block1_adv_act%d' % conv))
                else:
                    model.add(Activation('relu',
                                         name='block%d_act%d' % (block, conv)))
            model.add(MaxPooling1D(pool_size=pool_size, strides=pool_stride_size,
                                   padding='same', name='block%d_pool' % block))
            model.add(Dropout(0.1, name='block%d_dropout' % block))

        model.add(Flatten(name='flatten'))
        # two fully connected stages; they differ only in the dropout rate
        for index, rate in ((1, 0.7), (2, 0.5)):
            model.add(Dense(512, name='fc%d' % index))
            model.add(BatchNormalization())
            model.add(Activation('relu', name='fc%d_act' % index))
            model.add(Dropout(rate, name='fc%d_dropout' % index))

        model.add(Dense(num_classes, name='fc3'))
        model.add(Activation('softmax', name="softmax"))

        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        opti = Adamax(learning_rate=0.002, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.compile(loss="categorical_crossentropy", optimizer=opti, metrics=["accuracy"])
        self.model = model
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (which would add a stray "None" line).
        self.model.summary()

    def fit(self, X_train, Y_train, batch_size, epochs, verbose):
        """
        :param X_train: a numpy ndarray containing training data
        :param Y_train: a numpy ndarray containing the one-hot encoded labels
            for X_train
        :param batch_size: batch size to use for training
        :param epochs: number of epochs for training
        :param verbose: Console print options for training progress.
            0 - silent mode,
            1 - progress bar,
            2 - one line per epoch
        :return: None

        This method trains the model on the given data, holding out the last
        20% of it for validation. Training is logged to TensorBoard
        (./graph/DF) and to the CSV file logDF.csv.

        Tensorboard could be launched by navigating to the directory
        containing this file in terminal and running the following command.
            > tensorboard --logdir graph
        """
        tboard_cb = TensorBoard(log_dir='./graph/DF', histogram_freq=0,
                                write_graph=True, write_images=True)
        csv_logger = CSVLogger('logDF.csv')
        self.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=0.20,
                       callbacks=[tboard_cb, csv_logger])

class CNNdefault:
    """
    This class contains a CNN architecture with the default hyperparameters to
    model Website Traffic Fingerprinting using the direction information from
    undefended data.

    The hyperparameters (filters, activation, regularisation, optimizer) are
    set in ``__init__``; the compiled Keras model is available as
    ``self.model``.
    """
    def __init__(self, num_features, num_classes):
        """
        :param num_features: number of features (columns) in the data (X)
        :param num_classes: number of unique labels in the data (number of
            websites)
        """
        model = Sequential()
        num_filters = [4, 4]
        filter_sizes = [5, 5]
        l2_lambda = 0.0001
        # activation: 'tanh', 'relu', 'elu', 'selu', 'sigmoid'
        activation = 'tanh'
        # layer 1
        model.add(Conv1D(num_filters[0], filter_sizes[0],
                         input_shape=(num_features, 1), padding="same",
                         activation=activation,
                         kernel_regularizer=l2(l2_lambda)))
        # hidden layers
        model.add(Conv1D(num_filters[1], filter_sizes[1],
                         activation=activation,
                         kernel_regularizer=l2(l2_lambda)))
        model.add(MaxPooling1D(2))
        model.add(Flatten())
        # dropout layer
        model.add(Dropout(0.5))
        # final layer
        model.add(Dense(num_classes, activation='softmax'))

        # optimization function (RMSprop, Adam and Adamax are imported as
        # alternatives). `learning_rate` is the supported keyword; the old
        # `lr` alias is deprecated and rejected by newer Keras releases.
        opti = SGD(learning_rate=0.03)
        model.compile(loss='categorical_crossentropy', optimizer=opti,
                      metrics=['accuracy'])
        self.model = model
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (which would add a stray "None" line).
        self.model.summary()

    def fit(self, X_train, Y_train, batch_size, epochs, verbose,
            early_stopping_patience=None):
        """
        :param X_train: a numpy ndarray containing training data
        :param Y_train: a numpy ndarray containing the one-hot encoded labels
            for X_train
        :param batch_size: batch size to use for training
        :param epochs: number of epochs for training
        :param verbose: Console print options for training progress.
            0 - silent mode,
            1 - progress bar,
            2 - one line per epoch
        :param early_stopping_patience: optional. When given, training stops
            early once `val_loss` has not improved for this many epochs.
            The default (None) disables early stopping.
        :return: None

        This method trains the model on the given data, holding out the last
        20% of it for validation. Training is logged to TensorBoard
        (./graph/cnn-basic) and to the CSV file logBasic.csv.

        Tensorboard could be launched by navigating to the directory
        containing this file in terminal and running the following command.
            > tensorboard --logdir graph
        """
        callbacks = [TensorBoard(log_dir='./graph/cnn-basic', histogram_freq=0,
                                 write_graph=True, write_images=True)]
        if early_stopping_patience is not None:
            callbacks.append(EarlyStopping(monitor="val_loss",
                                           patience=early_stopping_patience))
        callbacks.append(CSVLogger('logBasic.csv'))

        self.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=0.20,
                       callbacks=callbacks)
        
class CNNbest:
    """
    This class contains a CNN architecture with the best possible hyperparameters
    to model Website Traffic Fingerprinting using the direction information from
    undefended data.

    The hyperparameters (filters, activation, regularisation, dropout rate,
    optimizer) are set in ``__init__``; the compiled Keras model is available
    as ``self.model``.
    """
    def __init__(self, num_features, num_classes):
        """
        :param num_features: number of features (columns) in the data (X)
        :param num_classes: number of unique labels in the data (number of
            websites)

        Besides building and compiling the model, this prints the model
        summary and writes a diagram of the architecture to 'model.png'.
        """
        model = Sequential()
        num_filters = [4, 4]
        filter_sizes = [5, 5]
        l2_lambda = 0.0001
        # layer 1
        model.add(Conv1D(num_filters[0], filter_sizes[0],
                         input_shape=(num_features, 1), padding="same",
                         activation='relu',
                         kernel_regularizer=l2(l2_lambda)))
        # hidden layers
        model.add(BatchNormalization())
        model.add(Conv1D(num_filters[1], filter_sizes[1],
                         activation='relu',
                         kernel_regularizer=l2(l2_lambda)))
        model.add(BatchNormalization())
        model.add(MaxPooling1D(2))
        model.add(Flatten())
        model.add(Dropout(0.9))
        # final layer
        model.add(Dense(num_classes, activation='softmax'))

        # optimization function. `learning_rate` is the supported keyword;
        # the old `lr` alias is deprecated and rejected by newer Keras
        # releases.
        opti = SGD(learning_rate=0.03)
        model.compile(loss='categorical_crossentropy', optimizer=opti,
                      metrics=['accuracy'])
        self.model = model
        # summary() prints by itself and returns None, so it is not wrapped
        # in print() (which would add a stray "None" line).
        self.model.summary()
        plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=False)

    def fit(self, X_train, Y_train, batch_size, epochs, verbose,
            early_stopping_patience=None):
        """
        :param X_train: a numpy ndarray containing training data
        :param Y_train: a numpy ndarray containing the one-hot encoded labels
            for X_train
        :param batch_size: batch size to use for training
        :param epochs: number of epochs for training
        :param verbose: Console print options for training progress.
            0 - silent mode,
            1 - progress bar,
            2 - one line per epoch
        :param early_stopping_patience: optional. When given, training stops
            early once `val_loss` has not improved for this many epochs.
            The default (None) disables early stopping.
        :return: None

        This method trains the model on the given data, holding out the last
        20% of it for validation. Training is logged to TensorBoard
        (./graph/cnn-best) and to the CSV file logBest.csv.

        Tensorboard could be launched by navigating to the directory
        containing this file in terminal and running the following command.
            > tensorboard --logdir graph
        """
        callbacks = [TensorBoard(log_dir='./graph/cnn-best', histogram_freq=0,
                                 write_graph=True, write_images=True)]
        if early_stopping_patience is not None:
            callbacks.append(EarlyStopping(monitor="val_loss",
                                           patience=early_stopping_patience))
        callbacks.append(CSVLogger('logBest.csv'))

        self.model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
                       verbose=verbose, validation_split=0.20,
                       callbacks=callbacks)

def main(model_class=None, batch_size=25, epochs=30, verbose=1):
    """
    Load the traces, train one model and evaluate it on the held-out test set.

    :param model_class: model wrapper class to train; it is called as
        ``model_class(num_features, num_classes)`` and must offer ``fit`` and
        a Keras model in ``.model`` (CNNdefault, CNNbest, AWF or DF).
        Defaults to CNNbest.
    :param batch_size: training batch size. The default of 25 was used for
        the CNNdefault / CNNbest models; 128 was used for AWF and DF.
    :param epochs: number of training epochs (30 works for the current
        datasets with early stopping disabled).
    :param verbose: Keras console print option passed on to ``fit``.
    :return: (accuracy, loss) on the test set; accuracy is a percentage and
        both values are rounded to two decimals.
    """
    if model_class is None:
        model_class = CNNbest

    # Load the data and create X and Y matrices
    data = get_data()
    num_features = data.shape[1] - 1
    X = data[:, :num_features]
    Y = data[:, -1]
    # Fix the one-hot width from the full label set, so the train and test
    # label matrices always have the same number of columns, even if the
    # highest label is missing from one of the two splits.
    num_classes = int(Y.max()) + 1

    # split the data into training and test set
    X_train, Y_train, X_test, Y_test = split_data(X, Y, 0.85, balance_dist=True)
    # add the trailing channel axis expected by Conv1D
    X_train = np.expand_dims(X_train, axis=2)
    X_test = np.expand_dims(X_test, axis=2)
    Y_train = to_categorical(Y_train, num_classes=num_classes)
    Y_test = to_categorical(Y_test, num_classes=num_classes)
    print(X[0])

    # instantiate the model and train on the data
    model = model_class(num_features, num_classes)
    model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
              verbose=verbose)

    # Evaluate the trained model on test data and print the accuracy
    score = model.model.evaluate(X_test, Y_test, batch_size=100)
    accuracy = round(score[1]*100, 2)
    loss = round(score[0], 2)
    print("Test accuracy: ", accuracy)
    print("Test loss: ", loss)
    return accuracy, loss

In [11]:
# Run the whole pipeline: load the data, train the model and print the test
# accuracy and loss.
if __name__ == '__main__':
  main()


loading data...




done
[ 1 -1 -1 ...  0  0  0]
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_2 (Conv1D)           (None, 8000, 4)           24        
                                                                 
 batch_normalization_2 (Batc  (None, 8000, 4)          16        
 hNormalization)                                                 
                                                                 
 conv1d_3 (Conv1D)           (None, 7996, 4)           84        
                                                                 
 batch_normalization_3 (Batc  (None, 7996, 4)          16        
 hNormalization)                                                 
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 3998, 4)          0         
 1D)                                                             
                         

In [None]:
# Open TensorBoard inline on the training logs the models write under ./graph
%tensorboard --logdir graph/