Initial commit — Coursera Big Data coursework

This commit is contained in:
2026-05-09 03:03:14 +00:00
commit c85c238e41
30 changed files with 2488 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
module data
open System.IO
open System.Text.RegularExpressions
/// Stopwords extracted from "stopwords.py" in the current working directory.
///
/// Every run of word characters (possibly empty) enclosed in single quotes is
/// taken as one stopword. This is a value, not a function: the file is read
/// once, when the binding is evaluated; the resulting sequence is lazy.
let getStopwords =
    let source = File.ReadAllText(@"stopwords.py")
    // The named group captures just the word between the quotes.
    let quotedWords = Regex.Matches(source, @"'(?<stopword>\w*)'")
    seq {
        for m in Seq.cast<Match> quotedWords do
            yield m.Groups.["stopword"].Value
    }
/// One book record as parsed from a data file line.
type Book =
    {
        /// First `:::`-separated field of the line.
        id : string
        /// Second field, split on `::` into individual author entries.
        authors : string list
        /// Third field of the line.
        title : string
    }
/// Loads the book records stored in the files of `directory`.
///
/// Each non-blank line is expected to look like
/// `id:::author1::author2:::title`: fields are separated by `:::`, and the
/// authors field is further split on `::`. Fields beyond the third are ignored.
///
/// The file list is obtained immediately; the files themselves are read and
/// parsed lazily, as the returned sequence is enumerated.
///
/// Blank (empty or whitespace-only) lines are skipped.
/// Raises `System.FormatException` during enumeration when a non-blank line
/// has fewer than three `:::`-separated fields; the message names the file
/// and the 1-based line number.
let getData directory =
    // Parses one line; returns None for blank lines so they can be dropped.
    let parseLine (filename : string) (lineNumber : int) (line : string) =
        if System.String.IsNullOrWhiteSpace line then
            None
        else
            let fields = Regex.Split(line, ":::")
            // Fail with context instead of a bare IndexOutOfRangeException.
            if fields.Length < 3 then
                let message =
                    sprintf "%s(%d): expected at least 3 ':::'-separated fields but found %d"
                        filename lineNumber fields.Length
                raise (System.FormatException message)
            Some
                { id = fields.[0]
                  authors = Regex.Split(fields.[1], "::") |> List.ofArray
                  title = fields.[2] }

    let parseFile (filename : string) =
        File.ReadAllLines(filename)
        |> Seq.mapi (fun index line -> index + 1, line)
        |> Seq.choose (fun (lineNumber, line) -> parseLine filename lineNumber line)

    Directory.GetFiles(directory)
    |> Seq.collect parseFile