Neurociencia y Matemáticas: agosto 2018

En R es superfácil; en Python, para variar, es un infierno. Pero siempre habrá quien defienda ese lenguaje, no solo por su rapidez, que la tiene, pero...

def descripcionDF(DFSpark, StringBool=True, timeStampBool=True, floatBool=True):
    """Print a descriptive summary of a Spark DataFrame, grouped by column type.

    The function first prints how many columns there are of each dtype, then
    prints three sections (string, timestamp, float). Each section header is
    always printed; the section body is printed only when its flag is true
    and the DataFrame has at least one column of that dtype.

    Args:
        DFSpark: object exposing ``dtypes`` as ``(column_name, dtype_string)``
            pairs plus the ``cube`` / ``select`` / ``withColumn`` methods used
            below (a PySpark DataFrame).
        StringBool: when true, show value counts (descending) for every
            ``'string'`` column.
        timeStampBool: when true, show ``describe()`` for the ``'timestamp'``
            columns, after encoding each one as a ``YYYYMMDDHHMMSS`` long.
        floatBool: when true, show ``describe()`` for the ``'float'`` columns.

    Returns:
        None. All output goes to stdout.
    """
    separador = "========================================================================"

    tipoVar = pd.DataFrame(DFSpark.dtypes, columns=["variable", "tipo"])
    # Frequency table: number of columns per dtype.
    print(pd.crosstab(index=tipoVar.tipo, columns="count"))

    def columnas_de_tipo(tipo):
        # Plain Python list of the column names whose dtype string equals `tipo`.
        return tipoVar.variable[tipoVar.tipo == tipo].values.tolist()

    print(separador)
    print("Datos String")
    print(separador)

    columnas_string = columnas_de_tipo('string')
    # `and`, not `&`: `len(x) > 0 & flag` parses as `len(x) > (0 & flag)`,
    # which silently ignored the flag.
    if StringBool and columnas_string:
        for nombre in columnas_string:
            # show() prints the table itself and returns None, so it is not
            # wrapped in print() (that only added a stray "None" line).
            DFSpark.cube(nombre).count().orderBy(desc('count')).show()

    print(separador)
    print("Datos timeStamp")
    print(separador)

    columnas_fecha = columnas_de_tipo('timestamp')
    if timeStampBool and columnas_fecha:
        DFSparkFechas = DFSpark.select(columnas_fecha)
        for nombre in columnas_fecha:
            # Pack the timestamp into one sortable number, YYYYMMDDHHMMSS,
            # so that describe() can report min/max/mean for it.
            fecha_numerica = (
                year(nombre) * 1e10
                + month(nombre) * 1e8
                + dayofmonth(nombre) * 1e6
                + hour(nombre) * 1e4
                + minute(nombre) * 1e2
                + second(nombre)
            ).cast(LongType())
            DFSparkFechas = DFSparkFechas.withColumn(nombre + '_Fecha', fecha_numerica)
        DFSparkFechas.describe().show()

    print(separador)
    print("Datos float")
    print(separador)

    # NOTE: only columns whose dtype string is exactly 'float' are selected.
    columnas_float = columnas_de_tipo('float')
    if floatBool and columnas_float:
        # Pass a plain list to select(), as the other sections do, instead of
        # a pandas Series.
        DFSpark.select(columnas_float).describe().show()

Neurociencia y Matemáticas

miércoles, 29 de agosto de 2018

Descriptivas PySpark