Initial commit — Coursera Big Data coursework
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
module data
|
||||
open System.IO
|
||||
open System.Text.RegularExpressions
|
||||
|
||||
/// Stopwords extracted from the file "stopwords.py" (resolved relative to the
/// current working directory).
///
/// Every run of word characters enclosed in single quotes is yielded, in
/// order of appearance. Because the pattern uses `\w*`, an empty pair of
/// quotes ('') yields the empty string.
///
/// NOTE: this is a value, not a function — the file text is read when the
/// binding itself is evaluated, not on every use of `getStopwords`.
let getStopwords =
    let source = File.ReadAllText(@"stopwords.py")
    let quotedWord = @"'(?<stopword>\w*)'"
    // Pulls the captured word (without the surrounding quotes) out of a match.
    let stopwordOf (hit : Match) = hit.Groups.["stopword"].Value
    Regex.Matches(source, quotedWord)
    |> Seq.cast<Match>
    |> Seq.map stopwordOf
|
||||
|
||||
/// One book entry: an identifier, its list of authors, and its title.
type Book =
    { /// Identifier of the entry.
      id : string
      /// Author names, one list element per author.
      authors : string list
      /// Title of the entry.
      title : string }
|
||||
|
||||
/// Parses every file found directly in `directory` into a sequence of `Book`
/// records, one record per non-blank line.
///
/// Each line is split on ":::" into fields; field 0 becomes `id`, field 1 is
/// split again on "::" to give `authors`, and field 2 becomes `title`.
/// Any fields after the third are ignored.
///
/// Blank (empty or whitespace-only) lines are skipped; previously such a line
/// caused an index-out-of-range failure during enumeration. A non-blank line
/// with fewer than three ":::"-separated fields still raises that error.
///
/// The list of files is obtained immediately; the result is a `seq` built
/// with `Seq.collect`, so file contents are read as the sequence is consumed.
let getData directory =
    // Turns one "id:::author1::author2:::title" line into a Book.
    let parseLine line =
        let fields = Regex.Split(line, ":::")
        { id = fields.[0]
          authors = Regex.Split(fields.[1], "::") |> List.ofArray
          title = fields.[2] }

    // Reads one file and parses each of its non-blank lines.
    let parseFile filename =
        File.ReadAllLines(filename)
        |> Seq.filter (fun line -> not (System.String.IsNullOrWhiteSpace line))
        |> Seq.map parseLine

    Directory.GetFiles(directory)
    |> Seq.collect parseFile
|
||||
|
||||
Reference in New Issue
Block a user