Machine Learning/kdd sample
Jump to navigation
Jump to search
# Get a random subsample of students from the training set.
#
# Reads a tab-separated training file, chooses a set of student ids (at
# random, or the first ones seen in the file) and writes every row belonging
# to those students to "<input>_sample_<n>_<method>_<type>.csv".  When a
# sibling test file exists (the input name with "train" replaced by "test"),
# the rows of the same students are extracted from it as well.
#
# The list of candidate ids is cached in "<directory>/studentinfo.csv".
# NOTE(review): the cache is not keyed on the input file, so it looks like it
# must be deleted by hand when switching to a different data set -- confirm.
use strict;
use warnings;
use Getopt::Long;
use File::Basename;

my $numItems = 1000;
my $method   = "random";
my $type     = "students";
my $help     = "";

# An unparsable option (e.g. a non-numeric -numitems) shows the help text
# instead of continuing with a half-configured run.
GetOptions(
    'numitems=i' => \$numItems,
    'method=s'   => \$method,
    'type=s'     => \$type,
    'h'          => \$help,
) or $help = 1;

my $inputFile = shift(@ARGV);
if (not($inputFile)) {
    $help = 1;
}
my $progname = basename($0);

if ($help) {
    print "This program will sample a tab-separated txt file of students.\n";
    print "It can be used to get all examples per student (for a number of students).\n";
    print "\n";
    print "Basic usage:\n";
    print "$progname <input file>\n";
    print "\n";
    print "Full usage:\n";
    print "$progname [-numitems <number of items>] [-method <'random'|'first'>] [-type <'students'>] <input file>\n";
    print "\n";
    print "Examples:\n";
    print "$progname algebra_2008_2009_train.txt\n";
    print " by default, will create a sample of 1000 random students (all examples on those students)\n";
    print "$progname -numitems 20000 algebra_2008_2009_train.txt\n";
    print " create a sample of 20000 random students\n";
    print "$progname -type students -method first algebra_2008_2009_train.txt\n";
    print " create a sample of the first 1000 students\n";
    exit(0);
}
print "Type '$progname -h' to get the help\n";

# The id cache lives in ./download when that directory exists, else in cwd.
my $directory = "download";
if (not(-e $directory)) {
    $directory = ".";
}

my $outputFile = "${inputFile}_sample_${numItems}_${method}_${type}.csv";
print "Getting $numItems $method $type, putting in $outputFile\n";

# ---------------------------------------------------------------------------
# Get the list of possible ids, in order of first appearance so that
# "-method first" means the same thing with or without the cache file.
# ---------------------------------------------------------------------------
my $idIndex   = 1;     # zero-based column that holds the student id
my @sourceIds = ();

if ($type eq "students") {
    my $sourceIdFile = "$directory/studentinfo.csv";
    if (not(-e $sourceIdFile)) {
        # No cache yet: scan the input once and write the cache as we go.
        my %seen = ();
        open(my $in, '<', $inputFile)
            or die "Cannot read $inputFile: $!\n";
        open(my $out, '>', $sourceIdFile)
            or die "Cannot write $sourceIdFile: $!\n";
        while (defined(my $line = <$in>)) {
            chomp($line);
            # The header row is not a student.
            next if $. == 1 and $line =~ /Student Id/;
            my @values = split(/\t/, $line);
            my $id = $values[$idIndex];
            # Short or blank rows carry no id.
            next if not defined($id) or $id eq "";
            next if $seen{$id}++;
            print $out "$id\n";
            push @sourceIds, $id;
        }
        close($out) or die "Cannot finish writing $sourceIdFile: $!\n";
        close($in);
    }
    else {
        open(my $in, '<', $sourceIdFile)
            or die "Cannot read $sourceIdFile: $!\n";
        while (defined(my $line = <$in>)) {
            chomp($line);
            # Older cache files may start with the header cell; skip it.
            next if $. == 1 and $line =~ /Student Id/;
            next if $line eq "";
            push @sourceIds, $line;
        }
        close($in);
    }
}
else {
    die "Unsupported type '$type' (only 'students' is supported)\n";
}

# ---------------------------------------------------------------------------
# Get the list of ids to pull.  Never ask for more ids than exist; the
# requested number is still used in the output file names.
# ---------------------------------------------------------------------------
my $numToPick = $numItems;
if ($numToPick > scalar(@sourceIds)) {
    $numToPick = scalar(@sourceIds);
    print "Only $numToPick $type available, taking all of them\n";
}

my %idsWanted = ();
for (1 .. $numToPick) {
    my $id;
    if ($method eq "first") {
        $id = shift(@sourceIds);
    }
    else {
        # Remove the chosen id from the pool so it cannot be drawn twice.
        my $index = int(rand(scalar(@sourceIds)));
        $id = splice(@sourceIds, $index, 1);
    }
    $idsWanted{$id} = 1;
}

print "Pulling $type ids (found " . scalar(keys %idsWanted) . ":\n";
print "This could take a while...\n";

# Go through the training file and pull the lines of the wanted students.
extract_rows($inputFile, $outputFile, \%idsWanted, $idIndex);

# Do the same for the test file, but only when the input name really maps to
# a different, existing file; otherwise the pass would re-read the training
# file (overwriting its sample) or read from a file that is not there.
my $test_input_file = $inputFile;
if ($test_input_file =~ s/train/test/ and -e $test_input_file) {
    my $output_test_file =
        "${test_input_file}_sample_${numItems}_${method}_${type}.csv";
    extract_rows($test_input_file, $output_test_file, \%idsWanted, $idIndex);
}
else {
    print "No matching test file found, skipping the test sample\n";
}

exit(0);

# Copy the header row and every row of a wanted student to a new file.
#
# Parameters:
#   $sourceFile - tab-separated file to read
#   $targetFile - file to create (overwritten if present)
#   $wanted     - hash ref whose keys are the ids to keep
#   $column     - zero-based column that holds the id
# Dies when either file cannot be opened or the output cannot be flushed.
sub extract_rows {
    my ($sourceFile, $targetFile, $wanted, $column) = @_;

    open(my $in, '<', $sourceFile)
        or die "Cannot read $sourceFile: $!\n";
    open(my $out, '>', $targetFile)
        or die "Cannot write $targetFile: $!\n";

    # Progress is reported from the byte offset, so it is right for any file.
    my $totalBytes = -s $sourceFile;

    # Check first line for header (an empty file simply has none).
    my $first = <$in>;
    if (defined($first)) {
        chomp($first);
        if ($first =~ /Student Id/) {
            print $out "$first\n";
        }
        else {
            print "Er...no header...\n$first\n";
            my @values = split(/\t/, $first);
            my $id = $values[$column];
            if (defined($id) and $wanted->{$id}) {
                print $out "$first\n";
            }
        }
    }

    # Now go through the rest of the lines.
    my $lineNum = 1;
    while (defined(my $row = <$in>)) {
        chomp($row);
        my @values = split(/\t/, $row);
        my $id = $values[$column];
        if (defined($id) and $wanted->{$id}) {
            print $out "$row\n";
        }
        if ($lineNum % 100000 == 0) {
            my $percent = $totalBytes ? 100 * tell($in) / $totalBytes : 100;
            print "...line $lineNum ($percent %): "
                . (defined($id) ? $id : "") . "\n";
        }
        $lineNum++;
    }

    close($out) or die "Cannot finish writing $targetFile: $!\n";
    close($in);
    return;
}