Run the demo
Note: this tutorial is for FuxiCTR v1.0.
We provide multiple demo scripts to run a given model on the tiny dataset. Please follow these examples to get started. The code workflow is structured as follows:
# Set data params and model params
params = {...}
# Set the feature encoding specs
feature_encoder = FeatureEncoder(feature_cols, label_col, ...) # define the feature encoder
feature_encoder.fit(...) # fit and transform the data
# Load data generators
train_gen, valid_gen, test_gen = data_generator(feature_encoder, ...)
# Define a model
model = DeepFM(...)
# Train the model
model.fit_generator(train_gen, validation_data=valid_gen, ...)
# Evaluation
model.evaluate_generator(test_gen)
In the following, we walk through the demo script `DeepFM_demo.py`.
import sys
import os
from fuxictr.datasets import data_generator
from fuxictr.datasets.taobao import FeatureEncoder
from datetime import datetime
from fuxictr.utils import set_logger, print_to_json
import logging
from fuxictr.pytorch.models import DeepFM
from fuxictr.pytorch.utils import seed_everything
After importing the required packages, define the `params` dict for DeepFM.
# Feature spec: all 14 Taobao fields are treated as categorical string features.
feature_cols = [{'name': ["userid","adgroup_id","pid","cate_id","campaign_id","customer","brand","cms_segid",
"cms_group_id","final_gender_code","age_level","pvalue_level","shopping_level","occupation"],
'active': True, 'dtype': 'str', 'type': 'categorical'}]
# Label spec: the click column, read as float for binary classification.
label_col = {'name': 'clk', 'dtype': float}
# Data, model, and training hyper-parameters consumed by FeatureEncoder,
# data_generator, and DeepFM below.
params = {'model_id': 'DeepFM_demo',
'dataset_id': 'tiny_data_demo',
'train_data': '../data/tiny_data/train_sample.csv',
'valid_data': '../data/tiny_data/valid_sample.csv',
'test_data': '../data/tiny_data/test_sample.csv',
'model_root': '../checkpoints/',
'data_root': '../data/',
'feature_cols': feature_cols,
'label_col': label_col,
'embedding_regularizer': 0,
'net_regularizer': 0,
'hidden_units': [64, 64],
'hidden_activations': "relu",
'learning_rate': 1e-3,
'net_dropout': 0,
'batch_norm': False,
'optimizer': 'adam',
'task': 'binary_classification',
'loss': 'binary_crossentropy',
'metrics': ['logloss', 'AUC'],
'min_categr_count': 1,  # keep every category on this tiny demo dataset
'embedding_dim': 10,
'batch_size': 16,
'epochs': 3,
'shuffle': True,
'seed': 2019,
'monitor': 'AUC',
'monitor_mode': 'max',
'use_hdf5': True,
'pickle_feature_encoder': True,
'save_best_only': True,
'every_x_epochs': 1,
'patience': 2,
'workers': 1,
'verbose': 0,
'version': 'pytorch',
'gpu': -1}  # NOTE(review): presumably -1 selects CPU — confirm in fuxictr docs
# Set the logger and random seed
set_logger(params)
logging.info('Start the demo...')
logging.info(print_to_json(params))  # record the full config for reproducibility
seed_everything(seed=params['seed'])
Then use the FeatureEncoder to fit the training data and encode the raw features (e.g., normalizing continuous values and mapping/reindexing categorical features) from the csv files.
# Build the feature encoder that maps raw csv columns to model inputs.
feature_encoder = FeatureEncoder(feature_cols,
label_col,
dataset_id=params['dataset_id'],
data_root=params["data_root"],
version=params['version'])
# Fit the encoder on the training split only, so valid/test are encoded
# with the vocabulary learned from training data.
feature_encoder.fit(train_data=params['train_data'],
min_categr_count=params['min_categr_count'])
Preprocess the csv files to h5 files and get the data generators ready for train/validation/test. Note that the h5 files can be reused for subsequent experiments directly.
# Build train/validation/test generators from the csv splits; with
# use_hdf5=True the preprocessed h5 files are cached for reuse.
train_gen, valid_gen, test_gen = data_generator(feature_encoder,
train_data=params['train_data'],
valid_data=params['valid_data'],
test_data=params['test_data'],
batch_size=params['batch_size'],
shuffle=params['shuffle'],
use_hdf5=params['use_hdf5'])
Initialize a DeepFM model and fit the model with the training and validation data.
# Build DeepFM from the fitted feature map; remaining hyper-parameters
# (hidden_units, dropout, optimizer, etc.) are passed through **params.
model = DeepFM(feature_encoder.feature_map, **params)
# Train with validation after each epoch; NOTE(review): monitor/patience in
# params presumably drive early stopping inside fit_generator — confirm.
model.fit_generator(train_gen, validation_data=valid_gen, epochs=params['epochs'],
verbose=params['verbose'])
Reload the saved best model checkpoint for testing.
logging.info('***** validation/test results *****')
# Restore the best checkpoint saved during training (save_best_only=True)
# before reporting final validation and test metrics.
model.load_weights(model.checkpoint)
model.evaluate_generator(valid_gen)
model.evaluate_generator(test_gen)