32 lines
807 B
FSharp
32 lines
807 B
FSharp
module data
|
|
open System.IO
|
|
open System.Text.RegularExpressions
|
|
|
|
/// Stopwords extracted from the file "stopwords.py" (a relative path).
///
/// This is a value, not a function: the file is read once, when this binding
/// is evaluated. Every run of word characters enclosed in single quotes is
/// reported as one stopword, in order of appearance. Because the pattern uses
/// `\w*`, an empty pair of quotes ('') yields an empty string.
///
/// The result is a lazy `seq<string>` built on top of the match collection.
let getStopwords =
    // Pulls the text captured by the named group out of a single match.
    let stopwordOf (m: Match) = m.Groups.["stopword"].Value
    let source = File.ReadAllText(@"stopwords.py")
    let quoted = Regex.Matches(source, @"'(?<stopword>\w*)'")
    // MatchCollection is cast to a typed sequence before mapping.
    Seq.map stopwordOf (Seq.cast<Match> quoted)
|
|
|
|
/// One book record, with three fields:
///   id      - the record's identifier, kept as text
///   authors - the book's authors, as a list of strings
///   title   - the book's title
type Book =
    { id: string
      authors: string list
      title: string }
|
|
|
|
/// Loads the book records stored in the files directly inside `directory`.
///
/// Each non-blank line is split on ":::" into fields: the first becomes the
/// id, the second is split again on "::" into the list of authors, and the
/// third becomes the title. Any further fields are ignored. Blank and
/// whitespace-only lines are skipped.
///
/// The directory listing is taken immediately, but the returned sequence is
/// lazy: each file is read when the sequence is enumerated (and again on
/// every re-enumeration), so I/O and parse errors surface at that point.
///
/// Raises System.IndexOutOfRangeException during enumeration when a
/// non-blank line has fewer than three ":::"-separated fields.
let getData directory =
    // Turns one file into a lazy sequence of Book records.
    let parseFile filename =
        File.ReadAllLines(filename)
        // Remember 1-based line numbers before filtering so that error
        // messages point at the real position in the file.
        |> Seq.mapi (fun index line -> index + 1, line)
        // A blank line carries no record; without this guard it would
        // split into a single field and fail on the authors lookup.
        |> Seq.filter (fun (_, line) -> not (System.String.IsNullOrWhiteSpace line))
        |> Seq.map (fun (lineNumber, line) ->
            let fields = Regex.Split(line, ":::")
            if fields.Length < 3 then
                // Same exception type that bare array indexing raises, so
                // existing handlers keep working, but with enough context
                // to locate the offending line.
                let message =
                    sprintf "%s(%d): expected at least 3 ':::'-separated fields but found %d"
                        filename lineNumber fields.Length
                raise (System.IndexOutOfRangeException message)
            {
                id = fields.[0]
                authors = Regex.Split(fields.[1], "::") |> List.ofArray
                title = fields.[2]
            })
    Directory.GetFiles(directory)
    |> Seq.collect parseFile
|
|
|