#########################################
#
# LARGE DATA AND R
# A COMPILATION OF EXAMPLES ON HOW TO USE CUSTOM FUNCTIONS
# TO AVOID MEMORY PROBLEMS AND WRITE EFFICIENT CODE
#########################################
######################################
# This is a work in progress.
# Please contact me at miguelvb@yahoo.com with any questions.
# These files are updated at
# www.github.com/miguelvb/large-data-R
# A tutorial and an R introduction can be found at
# www.repidemiology.wordpress.com
######################################
# source and directory ::
dirdata <- "C:/Users/jkc261/large-data-R" # YOU MUST CHANGE THIS DIRECTORY TO YOURS....
setwd(dirdata) # change this directory for your computer.
filename.functions <- paste(dirdata,"functions_large_data.r",sep="/")
filename.cpp <- paste(dirdata,"subset_df.cpp",sep="/")
# you must have installed all these packages :
require(Rcpp); require(data.table); require(ffbase); require(ggplot2); require(grid); require(inline); require(sqldf)
# sourcing the files containing functions :
source(filename.functions) # this contains some needed functions.
sourceCpp(filename.cpp) # Rcpp functions.
# 1 . MOTIVATION
# R is an open source programming language that is widely used in statistical computations.
# More and more researchers and universities are using R as their main statistical analysis language.
# R packages ("extra" functionalities) exist for almost any specific task one may need.
# R graphics capabilities are exceptional, and allow displaying data in almost any customized way.
# This makes R one of the best options to use in statistical analysis.
# 2. THE PROBLEM OF LARGE DATA
# A. MEMORY :
# R stores computed data, data being computed, and temporary data in RAM memory.
# For that reason, RAM overflow errors are common when we use large data in R.
# The limitation is not only the total RAM capacity, but also the availability of contiguous free RAM blocks:
# to store a vector, R needs a contiguous free block of RAM for that vector. Sometimes, while still having some hundreds of megabytes of
# free memory, we do not have any contiguous free block bigger than a few megabytes. Then a large vector will not fit into memory and we will get an error.
# This memory problem in R is one of the main reasons why R is not used in Large Data Statistical Analysis, and solving this issue would make
# possible the use of R for many new tasks, universities, research, and new scientific fields.
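# a tiny illustration of the contiguous-block problem (hedged: do NOT run this on a small
# machine, and the exact size that fails depends on your RAM and its fragmentation):
# x <- numeric(1e9) # asks for ~7.5 Gb of doubles in ONE contiguous block...
# # Error: cannot allocate vector of size 7.5 Gb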
# B. SPEED:
# Dealing with large data requires efficient code to speed up computations. Inefficient code can result in extremely slow processes
# that last hours or days instead of seconds or minutes.
# So special care must be taken to write efficient code, using native R code, some packages, and/or other languages.
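# a minimal, runnable illustration of that speed gap (timings are indicative only):
x <- runif(1e6)
system.time( y1 <- sqrt(x) ) # vectorized: a few milliseconds.
system.time({ y2 <- numeric(length(x)); for(i in seq_along(x)) y2[i] <- sqrt(x[i]) }) # explicit loop: much slower.
identical(y1, y2) # TRUE : same result, very different cost.
rm(x, y1, y2)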
# 3. SOLUTION TO LARGE DATA IN R
# The solution we found to deal with large data in R is a combination of several procedures and new functions.
# Here we will describe a summary of them.
# A. Solving memory issues:
# To solve memory problems we need to write data into disk, instead of loading into RAM memory.
# The packages ff and ffbase allow us to do this while keeping the code very efficient.
# Some main functions of ffbase were modified or re-coded for more efficiency, or to avoid memory problems that still exist in the ffbase package.
# The result is that we solve memory problems with ffbase plus some new functions, avoiding almost all possible memory problems in R while still keeping the code efficient.
# B. Speed in Large Data :
# For large data analysis, speed is an important factor: the difference between optimized and unoptimized code
# can make calculations in R faster by a factor of 10, 100 or 1000, making a huge difference in the possible use of and workflow with the data.
# To accomplish a speed-optimized code, we have used 3 main procedures :
# i . Efficient R code.
# Using vectorized functions, efficient built-in functions, etc. This can make the code several orders of magnitude faster than naive implementations (see details in the tutorial files).
# ii. Use data.table package for "group by" and split analysis.
# data.table is a very efficient package for computing "group by" operations, splits, etc.
# we use it for almost any "group by" analysis and for other specific tasks.
# iii. Rcpp: C++ code in R
# C++ is one of the fastest languages we can use.
# The packages "Rcpp" and "inline" allow writing C++ code in R in an "easy" way.
# For some tasks, Rcpp code is thousands of times faster than plain R code.
# We have developed some Rcpp functions to work with large data.
# We have also made some tutorials about how, when, and why use Rcpp into R code.
# C : combining speed and memory :
# All these speed-efficient techniques are combined with ffbase to compute tasks over blocks that fit into RAM memory, write the result to disk,
# and proceed with the next block of data loaded into RAM.
# In this way, we obtain the benefits of speed, avoiding memory problems.
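# a minimal sketch of this block-wise pattern ("toy" and "step" are made-up names for
# illustration; as shown later in this file, subsetting an ffdf by plain integer
# indexes loads just that block into RAM as a data.frame):
toy <- ffdf( v = ff(rnorm(1e5)), w = ff(1:1e5) )
step <- 2e4
out <- NULL
for(start in seq(1, nrow(toy), by = step)){
  end <- min(start + step - 1, nrow(toy))
  block <- toy[start:end, ] # one RAM-sized block, loaded as a data.frame.
  res <- data.frame(v2 = block$v^2) # any fast in-RAM computation goes here.
  out <- if(is.null(out)) as.ffdf(res) else ffdfappend(out, res) # write the result back to disk.
}
dim(out) # 1e5 rows, stored on disk as an ffdf.
rm(toy, out, block, res)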
# 4. COMPUTER CHARACTERISTICS
# Due to this memory problem in R, a good computer to work with R is one with plenty of RAM, running a 64-bit system.
# From 16 GB / 64 bits upwards, we already have a good computer that will be able to deal with large data.
# The work we present here was done on a 32-bit Windows computer with 4 GB of RAM, which is almost the "worst" computer we can get nowadays to work with R.
# Even with this computer, we could work in a very efficient way with databases containing dozens of millions of records and a few dozen columns.
# This means that, using the proposed techniques and functions, any computer will be able to deal with large data in R.
# And this opens new possibilities for the use of R with large data, for example in less wealthy countries or on common desktop computers.
# 5. EXAMPLE :
# We provide an example that covers the main techniques and packages used in our work with large data sets.
# It is not a comprehensive list of all the functions and techniques we have developed. For more details see the tutorial files and the appendix.
# check memory and make it bigger ::
gc() # this is the "garbage collector": it frees memory and displays the memory usage. Use it frequently.
# used (Mb) gc trigger (Mb) max used (Mb)
# Ncells 217522 5.9 407500 10.9 350000 9.4
# Vcells 207013 1.6 905753 7.0 786432 6.0
my_memory <- 4000 # the total RAM memory of the computer (use another number for your own computer).
memory.limit(size=my_memory) # set the limit to the computer's RAM. This allows R to use all the memory available.
memory.size() # show how much we are using
# load ffbase ::
# ffbase works with data almost in the same way as R, but the data are written to disk
# and only some parts are loaded into memory for computation. Results are written back to disk.
# ffdf objects do not have "character" vectors, only "factor" columns. So any character vector in R must be converted to a factor.
# Factor levels are loaded into RAM. So if a factor vector has, say, 4 million levels, those levels will be in RAM. This can cause memory problems.
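# a quick, illustrative check of that levels-in-RAM cost:
f <- factor(sample(1e5)) # a factor with 1e5 distinct levels.
print(object.size(levels(f)), units = "Mb") # the levels alone already take a few Mb of RAM.
rm(f)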
# we generate data with 20 million rows ::
# This is what we want to have :
# ident date diagnose region
# 12534 2000-01-23 F 020
# we can think of those data as diagnoses made in hospitals in some country. ident would be the identification number for the subject.
gc() # free some memory
Ndata = 20e6; # number of observations. 20 million in this example.
N_ids = Ndata/10 # number of unique idents.
memory.size() #check memory
ident = factor( sample( 1000:(N_ids+1000), Ndata, replace = T)) # sample ident from 1000 to 2 001 000
memory.size() # see how much is into memory now.
size <- object.size(ident); print(size , units = "Mb") # ident takes 145 Mb of RAM memory.
gc() # free some memory !
df <- ffdf( ident = ff(ident)) # create an ffdf database with one ff column. This is written to disk.
rm(ident) # we do not need it anymore; remember to remove it !!!!
# check df :
str(df) # that gives a complex list of things...
filename(df) # where is it stored ???
# make the dates data ::
fdates <- ff(sample( 0:10000, Ndata, replace = T) + as.Date("1970-10-01"))
df$date <- fdates # here we assign that ff vector to df.... if we change fdates, we will change df$date. This differs from standard R data.frames.
head(df[1,])
# check classes ::
class(df$ident) # ff_vector.
str(df$ident) # it is an integer as seen by ff, but it is loaded into RAM as a factor (ramclass = factor).
# levels ::
length( levels(df$ident) ) # how many levels
levels(df$ident)[1:10] # the first 10 levels.
# if we subset by indexes that are not ff vectors, we get an R data.frame (in RAM):
data <- df[1:10, ] ; data ; class(data)
# but if we subset by ff vectors, we get an ffdf (not in RAM, but on disk) ::
data <- df[ff(1:10), ] ; data ; class(data)
# this is very important: it lets us avoid loading data into RAM while we do computations !!!
# make diagnose data ::
fdiagnose <- ff(sample( letters, Ndata, replace = T )) # this gives an error because we are feeding a character vector...
fdiagnose <- ff(sample( factor(letters), Ndata, replace = T )) # this is ok, factors...
gc()
df$diagnose <- fdiagnose
df[1:3,]
# region data ::
fregion = ff( factor( sample( c("000","013" ,"014", "015", "020" ,"025", "030", "035" ,"040", "042", "050" ,"055" ,"060" ,"065" ,"070" ,"076", "080", "090"), Ndata, replace=T )) )
df$region <- fregion
df[1:3,]
gc()
# now let us save the data::
setwd(dirdata) # change this directory for your computer.
ffdfsave(df, filename="df") # this is moving the data from the temporary directory to the one we point to.
str(df[1,])
# 'data.frame': 1 obs. of 4 variables:
# $ ident : Factor w/ 1999915 levels "1000","1001",..: 779169
# $ date : Date, format: "1978-04-13"
# $ diagnose: Factor w/ 26 levels "a","b","c","d",..: 19
# $ region : Factor w/ 18 levels "000","013","014",..: 17
# to use our own defined functions, we will source (load) them from files (change the path to your own path to those files) :
require(Rcpp) # library to code C++ into R.
require(inline) # library to be able to load C++ functions into R code in the same R file.
source(filename.functions) # this contains some needed functions.
sourceCpp(filename.cpp) # Rcpp functions.
# order the data by ident and dates:
# we use our defined function order_ffdf: this function orders the data in several splits, avoiding memory problems. It can also print verbose output.
cols <- c("ident", "date")
df_o <- order_ffdf(df,cols,2,T) # order_ffdf(data= ,order_cols=,splits=,verbose=)
df_first <- subset_ffdf(df_o,c("date"), date < as.Date("1971-10-09")) # our function subset_ffdf(data=,condition_cols=,condition=,splits=,verbose=)
df_first[1:3,]
# apply a function to the data, splitting the data into blocks of a defined size but respecting equal values of a variable:
# for example, split the data into blocks of one million rows, but do not split rows with the same "ident", i.e., keep equal idents inside the same split.
# to use this function, data MUST BE ORDERED by the variable we want to split by.
# split_apply_cpp(input_data=,split_vector=,chk_size_=,fu=)
# first we create a function to apply in each split. This function must give a data.frame or a data.table :
fu_date <- function(x){
date <- x$date
before_1973 <- ifelse(date < "1973-01-01",1L,0L) # ifelse is a vectorized R function.
data <- data.frame(before_1973 = before_1973)
data
}
# let us check with some RAM data :
data <- df_o[1:50,]
fu_date(data)
# apply that to all the data, in splits respecting equal idents, so only approx. one split of rows is in RAM memory at a time.
# a good way to estimate a safe split size is to load some rows into RAM and check their size :
data_RAM <- df_o[1:1e6,]
size <- object.size(data_RAM); print(size , units = "Mb") # 122 Mb. So we could take as much as 3e6 perhaps (I take around 1/10 of the total RAM as a safe indicator).
rm(data_RAM)
chunk_size <- 3e6
result <- split_apply_cpp(df_o,df_o$ident, chunk_size ,fu_date) # 40 seconds. # remember data MUST be ordered by the split variable ("ident" in this example).
gc() # free memory
levels(df_o$diagnose)
# now, let us apply a more complex function: we want to know, for each ident, whether it was ever diagnosed with code "a", "b", or "c".
# then we need to make a split-apply-combine procedure.
# To make group_by tasks we will use data.table, which is very efficient.
library(data.table)
# our function (meant to be applied to data sharing the same "ident"):
get_diag_abc <- function(x){ any(x %in% c("a","b","c")) } # gives TRUE if there is "a","b" or "c" in the values of x.
# a test:
diag <- df_o$diagnose[1:30] # the first 30 diagnoses.
get_diag_abc(diag) # result. ok.
# now we want to apply that to data :
fu_diagnose_abc <- function(x){
x <- data.table(x) # make a data.table.
setkeyv(x, c("ident","date")) # index (order) by ident and date .
y <- x[, list(diagnose_abc = get_diag_abc(diagnose)), by = ident] # make a vector diagnose_abc as the any(---) for each ident.
y
}
#test:
data <- df_o[1:1000,]
res <- fu_diagnose_abc(data); res[1:20] # ok, works.
# ident diagnose_abc
# 1: 1000 FALSE
# 2: 1001 FALSE
# proceed to calculate that for all the 20 million rows :
chunk_size <- 3e6
result <- split_apply_cpp(df_o,df_o$ident, chunk_size ,fu_diagnose_abc) # 2 minutes. # remember data MUST be ordered by the split variable ("ident" in this example).
# the resulting database is an ffdf object: it is written to disk and not loaded into RAM memory.
gc()
# merging data:
# now we want to merge that result with the previous data. It is a merge as in SQL with:
# WHERE t1.ident = t2.ident ,
# i.e. the result only keeps rows which have values in EACH of the databases. We eliminate rows with values NOT in both data sets.
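# a tiny in-RAM illustration of this join semantics (merge_by uses the same data.table
# idiom internally, see the appendix; dt1 and dt2 are made-up toy tables):
dt1 <- data.table(ident = c(1L,2L,3L), v = c("x","y","z"), key = "ident")
dt2 <- data.table(ident = c(2L,3L,4L), w = c(10,20,30), key = "ident")
dt2[dt1, nomatch = 0] # only idents present in BOTH tables (2 and 3) remain.
rm(dt1, dt2)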
cols <- c("ident")
split <- c("ident")
merged <- merge_by(df_o,result,cols,split,1e6) # 5 minutes . # merge_by(x=,y=,keycols=,split_name=,chk=)
dim(df_o)
dim(result)
dim(merged)
# note that we have done the split and then a separate merge. It is better to do both with data.table in one single function:
fu_diagnose_abc_2 <- function(x){
x <- data.table(x) # make a data.table.
setkeyv(x, c("ident","date")) # index (order) by ident and date .
y <- x[, diagnose_abc := get_diag_abc(diagnose), by = ident] # now add that column.
y
}
chunk_size <- 3e6
result_2 <- split_apply_cpp(df_o,df_o$ident, chunk_size ,fu_diagnose_abc_2) # 3.4 minutes. # remember data MUST be ordered by the split variable ("ident" in this example).
result_2[1:2,]
# ident date diagnose region diagnose_abc
# 1 1000 1973-01-30 l 000 FALSE
# 2 1000 1974-10-16 e 070 FALSE
# another way to do the split-apply task is to loop over the data.
# loops over rows are almost forbidden in R, because they are EXTREMELY slow.
# But in Rcpp, loops over the rows are very fast.
# here is an example of Rcpp code that is doing the equivalent of our previous code:
# As in the previous code, data MUST be ordered by "ident" before applying the function.
require(inline)
require(Rcpp)
src <- '
#include <string>   // std::string and its == operator between strings.
#include <iostream> // std::cout, used in the commented debug lines.
using namespace Rcpp;
using namespace std;
IntegerVector ident = as<IntegerVector>(ident_);
CharacterVector diagnose_v = as<CharacterVector>(diagnose_) ; //
int N = ident.size();
IntegerVector diagnose_abc(N,0);
IntegerVector ident_idx(N,0);
int NewId = 0;
int OldId = -1;
int ident_index = 0;
int diagnose_abc_int = 0;
for (int i = 0; i < N; i++){
NewId = ident[i];
string diagnose = as<string>(diagnose_v[i]);
//printf("diagnoses abc: %s", diagnose );
if(NewId == OldId){
ident_index += 1;
ident_idx[ident_index] = i;
if( diagnose == "a" || diagnose == "b" || diagnose == "c" ) {
diagnose_abc_int = 1;
//cout << "diagnoses abc: " << i << endl;
}
// last case :: do not forget the last one if it is the same ident as the previous.
if( i == (N-1)) {
if(diagnose_abc_int == 1) { // if we get the result .
for( int k=0; k <= ident_index; k++) {
diagnose_abc[ ident_idx[k] ] = 1; // all the diagnose_abc for that ident are going to be 1.
//printf("diagnoses abc: %d", ident_idx[k] );
}
}
}
} else {
if(diagnose_abc_int == 1) { // if we get the result .
for( int k=0; k <= ident_index; k++) {
diagnose_abc[ ident_idx[k] ] = 1; // all the diagnose_abc for that ident are going to be 1.
//printf("diagnoses abc: %d", ident_idx[k] );
}
}
ident_index = 0;
ident_idx[ident_index] = i;
diagnose_abc_int = 0;
if( diagnose == "a" || diagnose == "b" || diagnose == "c" ) {
diagnose_abc_int = 1;
}
OldId = NewId;
// last case :: if the very last row starts a new ident by itself,
// the loop ends here, so do not forget to write its result too.
if( i == (N-1) && diagnose_abc_int == 1) {
diagnose_abc[i] = 1;
}
}
}
return diagnose_abc; // return the vector of results...
'
fun <- cxxfunction(signature(ident_ = "integer",diagnose_ = "integer"), src, plugin = "Rcpp") # defines the function....
gc()
ident <- df_o$ident[1:1000]
diagnose <- df_o$diagnose[1:1000]
res <- fun(ident, diagnose )
res
data <- data.frame(ident, diagnose, res)
data[1:50,]
# so the function is working and is extremely fast :
chunk_size <- 5e6 # try with 5 million rows, and check if we do not have memory problems.
fun_diagnose <- function(x){
res <- fun(x$ident,x$diagnose)
res <- data.frame(diagnose_abc = res)
res
}
result_3 <- split_apply_cpp(df_o[c("ident", "diagnose")],df_o$ident, chunk_size ,fun_diagnose) # 40 seconds. The main computing time is spent appending data and converting to ffdf.
# remember data MUST be ordered by the split variable ("ident" in this example).
# as C++ is very efficient, we could also try to run the function with all the data loaded in RAM :
gc()
ident <- df_o$ident[] # load ident data into RAM.
diagnose <- df_o$diagnose[] # load diagnose data into RAM.
system.time(result4 <- fun(ident,diagnose) ) # 3 seconds !!!
diag_abc <- as.integer(result_2$diagnose_abc[])
identical(diag_abc, result4) # they are identical !
rm(ident,diagnose, diag_abc)
gc()
### This was an example of the different timings we can get using efficient code in R.
### The same computation as above, using aggregate, the plyr package, etc., takes longer than 30 minutes.
### Comparing with our data.table + ffdf computation (3.4 minutes), or better our 3 seconds with Rcpp, we see that the improvements in speed we can gain with
### efficient coding can be of the order of 1800 / 3 = 600 times (from 30 minutes down to 3 seconds).
### If the data were larger, the difference would be much bigger, sometimes making some "simple" tasks impossible for very large data sets if we do not use efficient code.
############### APPENDIX ##############################
# SUMMARY OF CUSTOM FUNCTIONS ::
# code to load the functions. Change directories to point to your local files.
require(Rcpp) # library to code C++ into R.
require(inline) # library to be able to load C++ functions into R code in the same R file.
source(filename.functions) # this contains some needed functions.
sourceCpp(filename.cpp) # Rcpp functions.
################################################
############### order_ffdf #####################
################################################
# order_ffdf(data=,order_cols=,splits=,verbose=)
# orders an ffdf by the given column names, using splits. Can print verbose output.
# Using splits avoids the memory problems found when using ffdforder or ffdfsort.
# example:
data <- ffdf( x = ff(sample(1:10)), y = ff(factor(sample(letters, 10, replace = T))), z = ff(1:10))
data[1:3,]
# x y z
# 1 3 a 1
# 2 1 q 2
# 3 10 b 3
cols <- c("x","z")
res <- order_ffdf(data, cols,2,T)
res[1:3,]
# x y z
# 1 1 q 2
# 2 2 o 5
# 3 3 a 1
################################################
################################################
############### subset_ffdf ####################
################################################
# subset_ffdf(data=,condition_cols=,condition=,splits=,verbose=)
# subsets an ffdf by a condition on the condition columns. Can make splits if needed and print verbose output.
# example:
cols <- c("z","y")
data[1:3,]
# x y z
# 1 3 a 1
# 2 1 q 2
# 3 10 b 3
res <- subset_ffdf(data,cols, (z %in% c(1,2,7) ) | ( y %in% c("a","b","u") ) )
res[,]
# x y z
# 1 3 a 1
# 2 1 q 2
# 3 10 b 3
# 4 8 a 6
# 5 6 y 7
################################################
################# cbind_ffdf ###################
################################################
# cbind_ffdf(x=,y=)
# binds the columns of y to the columns of x.
# obs: if the result is not cloned, the bound cols are still the cols of y,
# and if we then save without cloning, y will lose all its columns.
# example :
data_2 <- ffdf( x2 = ff(sample(10:19)), y2 = ff(factor(sample(LETTERS, 10, replace = T))), z2 = ff(10:1))
data_2[1:3,]
# x2 y2 z2
# 1 19 L 10
# 2 11 Y 9
# 3 12 W 8
res <- cbind_ffdf(data,data_2) # will bind the columns of data_2 to the cols of data.
res[1:5,]
# x y z x2 y2 z2
# 1 3 a 1 19 L 10
# 2 1 q 2 11 Y 9
# 3 10 b 3 12 W 8
# 4 9 f 4 13 E 7
# 5 2 o 5 14 G 6
filename(res$x2)
filename(data_2$x2) # they have the same name, so we need to clone if we do not want to move those columns.
res <- clone(res)
filename(res$x2) # a different filename. Ready to save without moving data from data or data_2
################################################
################## append_ffdf ################
################################################
# append_ffdf(x=,y_=,chunk_s=)
# will append two ffdf, using chunks from the second ffdf.
# used mostly when ffdfappend could not append two ffdf, but only an ffdf with a data.frame.
# now we prefer the use of ffdfappend, which is equivalent to this one.
# This function can still be useful when memory conditions get difficult.
# It also has the feature of displaying info about the total estimated time, the remaining time, etc.
# example:
data_3 <- ffdf( x = ff(sample(1:10)), y = ff(factor(sample(letters, 10, replace = T))), z = ff(20:29))
res <- append_ffdf(data,data_3,4)
# [1] " done split 1 / 3 , row_end 4 spent : 0 , E. total 0 , E. remain 0"
# [1] " done split 2 / 3 , row_end 8 spent : 0 , E. total 0 , E. remain 0"
# [1] " done split 3 / 3 , row_end 10 spent : 0 , E. total 0 , E. remain 0"
# [1] " Total time : 0.1"
dim(res) # 20 3
# OBS: as with ffdfappend, we are adding data to the first ffdf. So the result will have the same filenames as the first ffdf:
filename(res$x) == filename(data$x) # TRUE.
# if we want to get a new ffdf, we must clone the first ffdf:
res <- append_ffdf(clone(data),data_3,4)
filename(res$x) == filename(data$x) # FALSE. Now we can save without moving files from "data".
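# the equivalent with plain ffdfappend, appending the rows of data_3 (loaded into
# RAM as a data.frame) to a clone of data:
res2 <- ffdfappend(clone(data), data_3[,])
dim(res2) # 20 3 , same result.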
################################################
################################################
############### merge_by #######################
################################################
# merge_by(x=,y=,keycols=,split_name=,chk=)
# merge_by is used to merge two ffdf in an efficient way. Data from the first ffdf must be ordered by the split variable.
# the merge is done over the key columns, and only rows that have the same combination of key column values in both data sets are retained.
# This is a merge as in SQL with: WHERE t1.ident = t2.ident , etc.
# merge_by uses a data.table merge of the form:
# merged <- y[x, nomatch = 0]
# so the "non matching" records are NOT included in the result.
# example:
N <- 1e5
n <- N/2
dat <- ffdf( x = ff(sample(1:N)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
dat2 <- ffdf( x = ff(sample(1:n)), y2= ff(factor(sample(LETTERS, n, replace = T))))
dat[1:5,]
# x y z
# 1 7167 t 1
# 2 28861 x 2
# 3 60652 z 3
# 4 82511 z 4
# 5 73610 p 5
dat2[1:5,]
# x y2
# 1 29099 R
# 2 22323 O
# 3 26713 K
# 4 3909 A
# 5 23463 N
keycols <- "x"
split_col <- "x"
# data MUST be ordered, the first one:
dat <- order_ffdf(dat, "x")
res <- merge_by(dat, dat2, keycols,split_col,N/3)
res[1:5,]
# x y2 y z
# 1 1 O q 94492
# 2 2 C d 99832
# 3 3 F o 28827
# 4 4 K b 19108
# 5 5 C n 28736
dim(res); dim(dat); dim(dat2)
################################################
################################################
############## index_ffdf ######################
################################################
# index_ffdf(x=,idx=,verbose=)
# function to index an ffdf.
# idx is an ff vector with the indexes.
# ffdforder gives memory problems because of its use of ffdfget_columnwise.
# to avoid those problems, we can use index_ffdf or, for larger data, index_ffdf_split.
# example:
N <- 1e6
n <- N/10
dat <- ffdf( x = ff(sample(1:n, N, replace = T)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
idx <- ffdforder(dat[c("x","z")])
res <- index_ffdf(dat, idx, T)
res[1:10,]
# x y z
# 1 1 l 134639
# 2 1 l 182189
# 3 1 c 196719
# 4 1 f 267096
# 5 1 r 353421
# 6 1 m 454636
# 7 1 c 455921
# 8 1 b 585642
# 9 1 e 587494
# 10 1 m 661377
idx <- ff(sample(1:1e5)) # take 1e5 random indexes.
res <- index_ffdf(dat, idx, T) # get the database of 1e5 random rows.
res[1:5,]
# x y z
# 1 12198 q 8479
# 2 81870 x 82379
# 3 43026 z 51030
# 4 86046 p 29775
# 5 8335 v 37094
################################################
################################################
#################### index_ffdf_split ##########
################################################
# index_ffdf_split(x=,idx=,nsplits=,verbose=)
# function to index an ffdf.
# idx is an ff vector with the indexes.
# ffdforder gives memory problems because of its use of ffdfget_columnwise.
# to avoid those problems, we can use index_ffdf or, for larger data, index_ffdf_split.
# using several splits, smaller blocks of data go into memory.
# this allows indexing huge databases with no memory overflow...
# example:
N <- 1e7 # 10 million rows
n <- N/10
dat <- ffdf( x = ff(sample(1:n, N, replace = T)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
idx <- ffdforder(dat[c("x","z")])
res <- index_ffdf_split(dat, idx,3, T)
res[1:10,]
# x y z
# 1 1 b 1045064
# 2 1 x 4055519
# 3 1 x 4936608
# 4 1 l 6032727
# 5 1 k 6798082
# 6 1 k 6903526
################################################
################################################
################# apply_ffdf_split #############
################################################
# apply_ffdf_split(x=,fu=,nsplits=1,verbose=F)
# applies a function fu to an ffdf.
# avoids memory overflow by splitting the computation into several parts.
# fu must return a data.frame and operates on in-RAM "R" objects.
# does not need any ordering of the ffdf.
# example:
N <- 1e7 # 10 million rows
n <- N/10
dat <- ffdf( x = ff(sample(1:n, N, replace = T)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
sum10 <- function(x){ data.frame( z10 = 10 + x$z)}
res <- apply_ffdf_split(dat,sum10, 4, T)
res[1:10,]
class(res)
# [1] 11 12 13 14 15 16 17 18 19 20
# "ffdf"
################################################
################################################
################# apply_ffdf_chunks ############
################################################
# apply_ffdf_chunks(data_=,fu=,chunk_size=,verbose=)
# applies a function to an ffdf.
# avoids memory overflow by splitting the computation into several parts of size chunk_size.
# does not need any ordering of the ffdf.
# example:
N <- 1e7 # 10 million rows
n <- N/10
dat <- ffdf( x = ff(sample(1:n, N, replace = T)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
values <- function(x){ data.frame( values = x$z + x$x, rest = x$z - x$x/2. ) }
res <- apply_ffdf_chunks(dat,values, 2e6, T)
res[1:10,]
# values rest
# 1 482437 1
# 2 21008 4
# 3 487840 9
# 4 235405 16
# 5 693192 25
# 6 49780 36
# 7 873622 49
# 8 875988 64
# 9 23761 81
# 10 514730 100
################################################
################# split_apply_cpp ##############
################################################
#split_apply_cpp(input_data=,split_vector=,chk_size_=,fu=)
# split_apply_cpp takes an ffdf which MUST be ordered by the split vector, and computes the function fu over splits of size chk_size_, RESPECTING equal split_vector
# values. That means that, when making splits, we will not break the data where equal values of the split vector occur: equal values of the split vector stay in one single split.
# This allows using this function with "group by" functions over large data sets.
# the function fu must operate on input_data as a data.frame (i.e. loaded in memory, not as an ffdf).
N <- 1e7 # 10 million rows
n <- N/10
dat <- ffdf( x = ff(sample(1:n, N, replace = T)), y = ff(factor(sample(letters, N, replace = T))), z = ff(1:N))
# order the data by the split vector !!!!
res <- order_ffdf(dat,"x")
res[1:5,]
# x y z
# 1 1 f 179992
# 2 1 b 938613
# 3 1 d 3160421
# 4 1 z 6188453
# 5 1 d 7054255
fun.900000 <- function(x){
zz <- x$z
res <- ifelse(zz > 900000, 1, 0)
res <- data.frame(g.than.900000 = res)
res # we return a data.frame.
}
# test to apply over a sample:
df <- dat[1:10,]; fun.900000(df) # ok.
# apply to all the data:
chunk_size = 1e6
res <- split_apply_cpp(res,res$x,chunk_size,fun.900000) # 20 seconds for 10 million rows.
################################################
##########################################################
# SOME UTILITIES ##############################
##########################################################
################################################
###### REDIR_FFDF ##############################
################################################
# redir_ffdf(ffdf=,newdir= )
# will redefine the directory for an ffdf database which was saved in one directory and moved manually to the present one.
# This function avoids the error filename == 0 , allowing the use of the ffdf again.
# It works for data saved with save.ffdf within a directory with the SAME name as the name of the database.
# example:
setwd(dirdata)
data_redir <- ffdf( one = ff(1:10), two = ff(11:20))
save.ffdf(data_redir, dir = "data_redir") # OBS: saved data to a directory named SAME as the name of the database.
# now suppose that we move the files manually (not using R) into the directory "P:/Projects/RLargeData/"
# to be able to work with the data again, we will do this:
dirdata.2 <- "P:/Projects/RLargeData/" # the new directory (change it to yours).
setwd(dirdata.2) # change the working directory.
load.ffdf("data_redir") # load the database
data.p <- redir_ffdf(data_redir, dirdata.2) # change the path names of the files to correct for the move.
data.p # now we can use it without access errors.
################################################
################################################
###### redir_ffdf_ffdfsave ####################
################################################
#redir_ffdf_ffdfsave(ff=,newdir=)
# will redefine the directory for an ffdf database which was saved in another directory and moved manually to the present one.
# This function avoids the error filename == 0 , and we will be able to use the ffdf again.
# It works for data saved with ffdfsave under the SAME file name as the name of the database.
# example:
setwd(dirdata)
data_r <- ffdf( one = ff(1:10), two = ff(11:20))
ffdfsave(data_r, file = "data_r") # saved data to a file named SAME as the name of the database.
# now suppose that we move manually (not using R) the files into the directory "P:/Projects/RLargeData/"
# to be able to work with the data again, we will do this:
dirP <- "P:/Projects/RLargeData/" # the new directory (change it to yours).
setwd(dirP) # change the directory.
load("data_r") # load the database
data.p <- redir_ffdf_ffdfsave(data_r, dirP) # change the path names of the files to correct for the move.
data.p # now we can use it without access errors.
################################################
################################################
############## is_ordered_by_row_col ###########
################################################
# is_ordered_by_row_col(data=,col_name=)
# gives TRUE if, in an ffdf database, the column named col_name equals the row numbers.
# That column MUST be of the form 1,2,3,4,5,.... N , as if it were the row indexes in order.
# example:
data <- ffdf( x = ff(1:100), y = ff(sample(1:2354, 100) ) ) # make a ffdf database.
data[1:10,]
is_ordered_by_row_col(data,"x") # TRUE
is_ordered_by_row_col(data,"y") # FALSE
# this function is used frequently when making split-apply-combine with data.table while keeping a "reference" column holding the row numbers.
# The function verifies whether we need to reorder the result before adding it back as a new column.
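# for in-RAM data the check is essentially this one-liner (a sketch only; the real
# function, defined in the sourced functions file, works on ffdf objects):
# all(df[[col_name]] == seq_len(nrow(df)))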
################################################
################################################
########## quote.names , qn ###################
################################################
# quote_names() will make a vector of the names that appear in the expression. Very useful.
# qn() is an abbreviation of the same function.
# example:
quote_names(ident,oyo,pea, fre, wer, q_32rer)
# gives:
# "ident" "oyo" "pea" "fre" "wer" "q_32rer"
qn(ident,region,ale) # "ident" "region" "ale"
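# a minimal sketch of how such a function can be implemented ("qn_sketch" is only an
# illustration; the real functions are defined in the sourced functions file):
qn_sketch <- function(...) sapply(substitute(list(...))[-1], deparse)
qn_sketch(ident, region, ale) # "ident" "region" "ale"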
# application for data:
data <- data.frame( x= 1:10, y = letters[1:10] , ww = LETTERS[1:10], zz = letters[10:1], aa = 20:11)
data[1:5,]
sub <- data[qn(x,y,zz)] # we use qn(x,y,zz) to specify the column names... same as c("x","y","zz") which is longer to write down.
sub[1:5,]
################################################