Run the demo

Note

The tutorial is for FuxiCTR v1.0.

We provide multiple demo scripts to run a given model on the tiny dataset. Please follow these examples to get started. The code workflow is structured as follows:

# Set data params and model params
params = {...}

# Set the feature encoding specs
feature_encoder = FeatureEncoder(feature_cols, label_col, ...) # define the feature encoder
feature_encoder.fit(...) # fit the encoder and transform the data

# Load data generators
train_gen, valid_gen, test_gen = data_generator(feature_encoder, ...)

# Define a model
model = DeepFM(...)

# Train the model
model.fit_generator(train_gen, validation_data=valid_gen, ...)

# Evaluation
model.evaluate_generator(test_gen)


In the following, we walk through the demo script DeepFM_demo.py step by step.

import os
import sys
import logging
from datetime import datetime

from fuxictr.datasets import data_generator
from fuxictr.datasets.taobao import FeatureEncoder
from fuxictr.utils import set_logger, print_to_json
from fuxictr.pytorch.models import DeepFM
from fuxictr.pytorch.utils import seed_everything

After importing the required packages, one needs to define the feature columns, the label column, and the params dict for DeepFM.

feature_cols = [{'name': ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
                          "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"],
                 'active': True, 'dtype': 'str', 'type': 'categorical'}]
label_col = {'name': 'clk', 'dtype': float}

params = {'model_id': 'DeepFM_demo',
          'dataset_id': 'tiny_data_demo',
          'train_data': '../data/tiny_data/train_sample.csv',
          'valid_data': '../data/tiny_data/valid_sample.csv',
          'test_data': '../data/tiny_data/test_sample.csv',
          'model_root': '../checkpoints/',
          'data_root': '../data/',
          'feature_cols': feature_cols,
          'label_col': label_col,
          'embedding_regularizer': 0,
          'net_regularizer': 0,
          'hidden_units': [64, 64],
          'hidden_activations': "relu",
          'learning_rate': 1e-3,
          'net_dropout': 0,
          'batch_norm': False,
          'optimizer': 'adam',
          'task': 'binary_classification',
          'loss': 'binary_crossentropy',
          'metrics': ['logloss', 'AUC'],
          'min_categr_count': 1,
          'embedding_dim': 10,
          'batch_size': 16,
          'epochs': 3,
          'shuffle': True,
          'seed': 2019,
          'monitor': 'AUC',
          'monitor_mode': 'max',
          'use_hdf5': True,
          'pickle_feature_encoder': True,
          'save_best_only': True,
          'every_x_epochs': 1,
          'patience': 2,
          'workers': 1,
          'verbose': 0,
          'version': 'pytorch',
          'gpu': -1}

# Set the logger and random seed
set_logger(params)
logging.info('Start the demo...')
logging.info(print_to_json(params))
seed_everything(seed=params['seed'])
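
This demo uses only categorical fields, but feature column specs follow the same pattern for other field types. As a purely hypothetical sketch (the price field below is not part of the tiny dataset), a numeric column could be declared as:

# Hypothetical spec for illustration only: 'price' is not in the tiny dataset.
# FuxiCTR also supports a 'numeric' feature type alongside 'categorical'.
numeric_cols = [{'name': 'price', 'active': True,
                 'dtype': 'float', 'type': 'numeric'}]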

Then create the FeatureEncoder, fit it on the training data, and encode the raw features from the CSV files (e.g., normalizing continuous values and mapping/reindexing categorical features).

feature_encoder = FeatureEncoder(feature_cols, 
                                 label_col, 
                                 dataset_id=params['dataset_id'], 
                                 data_root=params["data_root"],
                                 version=params['version'])
feature_encoder.fit(train_data=params['train_data'], 
                    min_categr_count=params['min_categr_count'])
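
After fitting, the encoder exposes a feature_map describing the encoded fields; the same object is passed to the model constructor below. A quick sanity check, assuming the feature_map carries num_fields and num_features attributes:

# Optional sanity check (assumes the fitted feature_map exposes
# num_fields/num_features attributes):
logging.info('Number of fields: {}'.format(feature_encoder.feature_map.num_fields))
logging.info('Number of features: {}'.format(feature_encoder.feature_map.num_features))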

Preprocess the CSV files into h5 files and build the data generators for training/validation/testing. Note that the h5 files can be reused directly in subsequent experiments.

train_gen, valid_gen, test_gen = data_generator(feature_encoder,
                                                train_data=params['train_data'],
                                                valid_data=params['valid_data'],
                                                test_data=params['test_data'],
                                                batch_size=params['batch_size'],
                                                shuffle=params['shuffle'],
                                                use_hdf5=params['use_hdf5'])
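
The returned generators can be consumed like ordinary iterable data loaders. A minimal sanity check, assuming they define a length in mini-batches:

# Optional: report batches per epoch (assumes the generators define __len__,
# like standard PyTorch data loaders):
logging.info('Train batches per epoch: {}'.format(len(train_gen)))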

Initialize a DeepFM model and fit it on the training data, monitoring performance on the validation data.

model = DeepFM(feature_encoder.feature_map, **params)
model.fit_generator(train_gen, validation_data=valid_gen, epochs=params['epochs'],
                    verbose=params['verbose'])
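
With 'save_best_only': True, training keeps only the checkpoint with the best monitored validation AUC; its path is tracked on the model and reloaded in the final step below:

# The best checkpoint path; the same model.checkpoint attribute is used
# when reloading weights in the evaluation step:
logging.info('Best checkpoint: {}'.format(model.checkpoint))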

Finally, reload the best saved checkpoint and evaluate it on the validation and test sets.

logging.info('***** validation/test results *****')
model.load_weights(model.checkpoint)
model.evaluate_generator(valid_gen)
model.evaluate_generator(test_gen)
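
Note that evaluate_generator also returns the computed metrics, so the calls above can capture them for further processing. A sketch, assuming a dict return value keyed by the configured metrics (logloss, AUC):

# Optional: capture the returned metric dicts (assumed shape:
# {'logloss': ..., 'AUC': ...}, following the 'metrics' setting above)
valid_result = model.evaluate_generator(valid_gen)
test_result = model.evaluate_generator(test_gen)
logging.info('Validation results: ' + print_to_json(valid_result))
logging.info('Test results: ' + print_to_json(test_result))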