#!/usr/bin/env python3
"""Convert the Netflix Prize data files into tab-separated CSV files.

Usage: script.py [dir [csvdir]]

``dir`` is the directory holding ``movie_titles.txt``, ``training_set``,
``probe.txt`` and ``qualifying.txt`` (default ``.``); ``csvdir`` is the
directory the ``.csv`` files are written to (default ``csv``).
"""

# References:
#   http://www.csie.ntu.edu.tw/~r95007/thesis/svdnetflix/report/report.pdf
#   http://eecs.wsu.edu/~vjakkula/MLProject.pdf
#   http://michielvanwezel.com/papers/kagie_vdloos_vwezelV2.pdf
#   http://cseweb.ucsd.edu/users/elkan/KddNetflixWorkshop.pdf
#   http://www.cs.uic.edu/~liub/KDD-cup-2007/proceedings/The-Netflix-Prize-Bennett.pdf
#
# Preparing the dataset
#
# 1. Shell: merge all the data set files into a single file
#
#     #!/bin/bash
#     for x in netflix/training_set/mv_*.txt ; do cat $x >> ratings.txt ; done &
#
# 2. Python (this script), from
#    http://www.netflixprize.com/community/viewtopic.php?id=87
#    The original version required downloading the third-party ``path``
#    module; this version uses the standard-library ``pathlib`` instead.
#
# 3. Perl script: see the notes at the end of this file.

import csv
import sys
from pathlib import Path

# Two-character marker (backslash + N) written for a missing value.
NULL = '\\N'

# latin-1 maps every byte to exactly one code point, so reading and writing
# with it passes single-byte data through unchanged (as Python 2 did).
_ENCODING = 'latin-1'


class Dialect(csv.excel):
    """Tab-separated output with LF line endings and minimal quoting."""

    delimiter = '\t'
    lineterminator = '\n'
    doublequote = False
    escapechar = None
    quoting = csv.QUOTE_MINIMAL


def _iterLines(path):
    """Yield the non-blank lines of *path*, stripped of surrounding whitespace.

    The file is closed as soon as the generator finishes.
    """
    with open(path, encoding=_ENCODING) as infile:
        for line in infile:
            line = line.strip()
            if line:
                yield line


def _iterGroupedLines(path):
    """Yield ``(movie_id, line)`` for each data line under a ``<movie_id>:`` header.

    Raises:
        ValueError: if a data line appears before the first header, or a
            header does not hold an integer.
    """
    movie_id = None
    for line in _iterLines(path):
        if line.endswith(':'):
            movie_id = int(line[:-1])
        elif movie_id is None:
            raise ValueError(
                '%s: data line %r before any "<movie_id>:" header' % (path, line))
        else:
            yield movie_id, line


def csvDump(iter_rows_func, basename, dir='.', csvdir='csv', dialect=Dialect):
    """Write the rows produced by *iter_rows_func* to ``<csvdir>/<stem>.csv``.

    Args:
        iter_rows_func: callable taking the input path (``dir/basename``) and
            returning an iterable of row tuples.
        basename: name of the input file or directory inside *dir*.
        dir: directory containing the input.
        csvdir: output directory; created if it does not exist.
        dialect: csv dialect used for writing.

    Nothing is done when the output file already exists.
    """
    csvdir = Path(csvdir)
    csvdir.mkdir(parents=True, exist_ok=True)
    inpath = Path(dir) / basename
    outfile = csvdir / (inpath.stem + '.csv')
    if outfile.exists():
        return
    print('Writing %s ...' % outfile, file=sys.stderr)
    # Write to a scratch file and rename only on success: because existing
    # outputs are skipped, a truncated file left by a failed run would
    # otherwise be mistaken for a finished one.
    partial = outfile.with_name(outfile.name + '.part')
    # newline='' lets the csv writer control line endings itself.
    with open(partial, 'w', newline='', encoding=_ENCODING) as out:
        csv.writer(out, dialect).writerows(iter_rows_func(inpath))
    partial.replace(outfile)


def iterMovieRows(path):
    """Yield ``(movie_id, year, title)`` from ``id,year,title`` lines.

    ``year`` is an int, or ``NULL`` when the field is the text ``NULL``.
    The title may itself contain commas.
    """
    for line in _iterLines(path):
        movie_id, year, title = line.split(',', 2)
        yield (int(movie_id), NULL if year == 'NULL' else int(year), title)


def iterTrainingSetRows(dir):
    """Yield ``(movie_id, user_id, date, rating)`` for every file under *dir*.

    Each file starts with a ``<movie_id>:`` line followed by
    ``user_id,rating,date`` lines. Files are visited recursively in sorted
    order; empty files contribute no rows.
    """
    files = sorted(p for p in Path(dir).rglob('*') if p.is_file())
    for file_path in files:
        for movie_id, line in _iterGroupedLines(file_path):
            user_id, rating, date = line.split(',', 2)
            yield (movie_id, int(user_id), date, float(rating))


def iterProbeSetRows(path):
    """Yield ``(movie_id, user_id)`` from ``<movie_id>:`` headers and user-id lines."""
    for movie_id, line in _iterGroupedLines(path):
        yield (movie_id, int(line))


def iterQualifyingSetRows(path):
    """Yield ``(movie_id, user_id, date)`` from headers and ``user_id,date`` lines.

    ``user_id`` and ``date`` are yielded as the strings read from the file.
    """
    for movie_id, line in _iterGroupedLines(path):
        user_id, date = line.split(',')
        yield (movie_id, user_id, date)


def main(argv=None):
    """Dump all four data sets; ``argv[1]`` is the data dir, ``argv[2]`` the csv dir."""
    argv = sys.argv if argv is None else argv
    kwds = {}
    if len(argv) > 1:
        kwds['dir'] = argv[1]
    if len(argv) > 2:
        kwds['csvdir'] = argv[2]
    jobs = [
        (iterMovieRows, 'movie_titles.txt'),
        (iterTrainingSetRows, 'training_set'),
        (iterProbeSetRows, 'probe.txt'),
        (iterQualifyingSetRows, 'qualifying.txt'),
    ]
    for iterfunc, basename in jobs:
        csvDump(iterfunc, basename, **kwds)


if __name__ == '__main__':
    main()

# ---------------------------------------------------------------------------
# Perl script (alternative: one big comma-separated file for MySQL)
#
#     #!/usr/bin/perl
#     use strict;
#     my $dir = '/path/to/your/training_set';
#     opendir DIR, $dir or die("could not open $dir");
#     while(my $fname = readdir DIR) {
#         my $fname = "$dir/$fname";
#         open FILE, $fname or die("could not open $fname");
#         (my $mid = <FILE>) =~ s/:.*//s;
#         while(<FILE>) {
#             chomp;
#             print qq("$mid",);
#             map { print qq("$_",) } split /,/;
#             print "\n";
#         }
#         close FILE;
#     }
#     closedir DIR;
#     exit;
#
#     $ time ./bigcsv.pl > bigcsv.csv
#     real 35m11.521s
#     user 10m36.272s
#     sys 4m9.940s
#
#     mysql> LOAD DATA INFILE 'bigcsv.csv' INTO TABLE main FIELDS TERMINATED BY ',' ENCLOSED BY '"' LINES TERMINATED BY '\n';
#     Query OK, 100480507 rows affected (5 min 34.39 sec)
#     Records: 100480507 Deleted: 0 Skipped: 0 Warnings: 0