Run the demo

Note

The tutorial is for FuxiCTR v1.2.

We provide multiple demo scripts to run a given model on the tiny dataset. Please follow these examples to get started. The code workflow is structured as follows:

# Set data params and model params
params = {...}

# Define the feature encoder with feature encoding specs
feature_encoder = FeatureEncoder(feature_cols, label_col, ...)

# Build dataset from csv to h5
datasets.build_dataset(feature_encoder, train_data, valid_data, test_data)

# Get the feature_map required for data loading and model training
feature_map = feature_encoder.feature_map

# Load data generators
train_gen, valid_gen = datasets.h5_generator(feature_map, ...)
                                             
# Define a model
model = DeepFM(feature_map, ...)

# Train the model
model.fit_generator(train_gen, validation_data=valid_gen, ...)

# Load the test data generator and evaluate the model
test_gen = datasets.h5_generator(feature_map, ...)
model.evaluate_generator(test_gen)

In the following, we walk through the demo script DeepFM_demo.py.

import os
import logging
from datetime import datetime
from fuxictr import datasets
from fuxictr.datasets.taobao import FeatureEncoder
from fuxictr.features import FeatureMap
from fuxictr.utils import load_config, set_logger, print_to_json
from fuxictr.pytorch.models import DeepFM
from fuxictr.pytorch.torch_utils import seed_everything

After importing the required packages, one needs to define the feature columns, the label column, and the params dict for DeepFM.

feature_cols = [{'name': ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
                          "cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"],
                 'active': True, 'dtype': 'str', 'type': 'categorical'}]
label_col = {'name': 'clk', 'dtype': float}
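# For illustration only (not part of this tiny demo): FuxiCTR also supports
# numeric and sequence feature columns, e.g. a hypothetical spec such as
# {'name': 'price', 'active': True, 'dtype': 'float', 'type': 'numeric'}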

params = {'model_id': 'DeepFM_demo',
          'dataset_id': 'taobao_tiny',
          'train_data': '../data/tiny_data/train_sample.csv',
          'valid_data': '../data/tiny_data/valid_sample.csv',
          'test_data': '../data/tiny_data/test_sample.csv',
          'model_root': '../checkpoints/',
          'data_root': '../data/',
          'feature_cols': feature_cols,
          'label_col': label_col,
          'embedding_regularizer': 0,
          'net_regularizer': 0,
          'hidden_units': [64, 64],
          'hidden_activations': "relu",
          'learning_rate': 1e-3,
          'net_dropout': 0,
          'batch_norm': False,
          'optimizer': 'adam',
          'task': 'binary_classification',
          'loss': 'binary_crossentropy',
          'metrics': ['logloss', 'AUC'],
          'min_categr_count': 1,
          'embedding_dim': 10,
          'batch_size': 16,
          'epochs': 3,
          'shuffle': True,
          'seed': 2019,
          'monitor': 'AUC',
          'monitor_mode': 'max',
          'use_hdf5': True,
          'pickle_feature_encoder': True,
          'save_best_only': True,
          'every_x_epochs': 1,
          'patience': 2,
          'num_workers': 1,
          'partition_block_size': -1,
          'verbose': 1,
          'version': 'pytorch',
          'gpu': -1}

# Set the logger and random seed
set_logger(params)
logging.info(print_to_json(params))
seed_everything(seed=params['seed'])
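Note that load_config is imported above but not used in this inline demo. In FuxiCTR's benchmark scripts, the same params are typically loaded from YAML config files rather than defined in code. Below is a minimal sketch, assuming a hypothetical ../config/ directory containing dataset_config.yaml and model_config.yaml with an experiment entry named DeepFM_demo (verify the exact layout against your FuxiCTR version):

# Hypothetical alternative: load params from YAML configs instead of defining them inline
params = load_config('../config/', 'DeepFM_demo')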

Then create the FeatureEncoder, which fits on the training data and encodes the raw features (e.g., normalizing continuous values and mapping/reindexing categorical features) from the csv files.

# Set feature_encoder that defines how to preprocess data
feature_encoder = FeatureEncoder(feature_cols, 
                                 label_col, 
                                 dataset_id=params['dataset_id'], 
                                 data_root=params["data_root"])

# Build dataset from csv to h5
datasets.build_dataset(feature_encoder, 
                       train_data=params["train_data"], 
                       valid_data=params["valid_data"], 
                       test_data=params["test_data"])

The csv files are then preprocessed into h5 files, and the data generators for training and validation are created. Note that the h5 files can be reused directly in subsequent experiments (see the sketch after the following code block).

# Get feature_map that defines feature specs
feature_map = feature_encoder.feature_map

# Get train and validation data generator from h5
data_dir = os.path.join(params['data_root'], params['dataset_id'])
train_gen, valid_gen = datasets.h5_generator(feature_map, 
                                             stage='train', 
                                             train_data=os.path.join(data_dir, 'train.h5'),
                                             valid_data=os.path.join(data_dir, 'valid.h5'),
                                             batch_size=params['batch_size'],
                                             shuffle=params['shuffle'])
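
As noted above, the h5 files (and the feature encoder pickled when pickle_feature_encoder=True) can be reused in later runs without rebuilding the dataset. A minimal sketch, assuming the fitted encoder was pickled to a feature_encoder.pkl file under the dataset's data directory (the exact filename may differ by version):

import pickle

# Reload the fitted feature encoder from a previous run (hypothetical file name)
with open(os.path.join(data_dir, 'feature_encoder.pkl'), 'rb') as f:
    feature_encoder = pickle.load(f)
feature_map = feature_encoder.feature_map

# Point h5_generator directly at the existing h5 files; no need to call build_dataset again
train_gen, valid_gen = datasets.h5_generator(feature_map,
                                             stage='train',
                                             train_data=os.path.join(data_dir, 'train.h5'),
                                             valid_data=os.path.join(data_dir, 'valid.h5'),
                                             batch_size=params['batch_size'],
                                             shuffle=params['shuffle'])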

Initialize a DeepFM model and fit it with the training and validation data. Per the params above, training monitors AUC on the validation set, stops early after patience epochs without improvement, and keeps only the best checkpoint.

model = DeepFM(feature_map, **params)
model.count_parameters() # print number of parameters used in model
model.fit_generator(train_gen, 
                    validation_data=valid_gen, 
                    epochs=params['epochs'],
                    verbose=params['verbose'])

Finally, reload the best checkpoint saved during training and evaluate it on the validation and test data.

model.load_weights(model.checkpoint) # reload the best checkpoint
logging.info('***** validation results *****')
model.evaluate_generator(valid_gen)

logging.info('***** test results *****')
test_gen = datasets.h5_generator(feature_map, 
                                 stage='test',
                                 test_data=os.path.join(data_dir, 'test.h5'),
                                 batch_size=params['batch_size'],
                                 shuffle=False)
model.evaluate_generator(test_gen)
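
The evaluation results are written to the log by evaluate_generator. If you need the metrics programmatically, the following is a minimal sketch that assumes evaluate_generator also returns the computed metrics as a dict keyed by the names in params['metrics'] (verify this against your FuxiCTR version):

# Capture the returned metrics (e.g., {'logloss': ..., 'AUC': ...}) for further use
test_result = model.evaluate_generator(test_gen)
logging.info('Test logloss: {:.6f}, AUC: {:.6f}'.format(test_result['logloss'], test_result['AUC']))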