jueves, 25 de octubre de 2018

CV para xgboost

ROC <- function( ModelProb , test, SAVE = F, LABEL = "", DIR= "") {
  require ('ROCR')
  require ('rpart')

  if(SAVE){
    if(DIR != "" ){
      if(substr(x = DIR, start = nchar(DIR), stop = nchar(DIR))  != "/"  ){
        DIR <- paste0(DIR,"/")
      }     
    }
    jpeg(filename = paste0(DIR ,LABEL,"ROC.jpeg") , width = 20, height = 20, units = "cm", res = 250, bg = "white" )
    par(las = 1)
  }
  pred <- prediction (ModelProb,  test [,ncol(test)])
  perf <- performance (pred, 'tpr', 'fpr')
  gini <- (unlist (performance (pred, measure = 'auc')@y.values) - .5) / .5
  ks <- max (unlist (perf@y.values) - unlist (perf@x.values))
  plot (x= 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values), type= 'l', lwd = 3
        , col =2, xaxs = 'i', yaxs = 'i', yaxt = 'n', xlab = '% falsos positivos', ylab = '% verdaderos positivos')
  abline (a=0, b=1, lty = 5)
  legend ('bottomright', 'hi', paste0 ('Gini = ', round (100*gini, 1), '%\n', 'Ks = ', round (100*ks,1), '%\n\n'))
  if(SAVE){
    dev.off()
  }
  return( list(gini = gini , ks = ks, aucDf = data.frame( real = 100 * unlist (perf@x.values), y = 100 * unlist (perf@y.values))))
}

FuncionCV <-
  function( dtrain_dt, labelVector
          , PosEta   = seq( from = 0.15, to = 0.35, by = 0.1)
          , PosDep   = seq( from = 1, to = 5, by = 2)
          , PosLoops = seq( from = 200, to = 300, by = 50)
          , objective = "binary:logistic"
          , nfold = 10, missing = "NAN", eval_metric = "auc"){

  ConfiDep <- c()
  ConfiLoops <- c()
  ConfiEta <- c()
  AUCres <- c()
  j <- 1


  for(loops in PosLoops){
    cat( "\n=====================\n", loops, "\n=====================\n" )
    for(maxD in PosDep){
      for(Ceta in PosEta){
     
        model_check <- xgb.cv (data = dtrain,
                               label = label,
                               nrounds = loops,
                               eta = Ceta,
                               max_depth = maxD,
                               nfold = nfold, # Con 10 es que coge uno a 10.
                               objective = objective,
                               eval_metric = eval_metric,
                               pred = TRUE,
                               missing = missing)
     
        val <- glmnet::auc( y = label
                            , prob = model_check$pred)
     
        ROC(ModelProb = model_check$pred, test = data.frame(label), SAVE = FALSE )
        title( paste0( "eta = ", Ceta, ", max_depth =", maxD,", nrounds = ",loops ) )
     
        AUCres[j] <- val
        ConfiDep[j] <- maxD
        ConfiEta[j] <- Ceta
        ConfiLoops[j] <- loops
        j <- j + 1
      }
    }
  }

  Resumen <-
    data.frame( AUCres = AUCres
              , GiniRes = AUCres *2 -1
              , ConfiDep = ConfiDep
              , ConfiEta = ConfiEta
              , ConfiLoops = ConfiLoops)

  return( Resumen)
}

No hay comentarios:

Publicar un comentario