Initial commit — Coursera Big Data coursework
This commit is contained in:
@@ -0,0 +1,47 @@
|
||||
// Learn more about F# at http://fsharp.net
|
||||
// See the 'F# Tutorial' project for more help.
|
||||
|
||||
open data
|
||||
open MapReduce
|
||||
open System.Text.RegularExpressions
|
||||
|
||||
/// Program entry point. Builds a per-author word-frequency table from book
/// titles and prints the entries whose lower-cased author name contains the
/// first command-line argument (also lower-cased).
///
/// argv: command-line arguments; argv.[0] is the author-name fragment to look for.
/// Returns 0 on success, or 1 when no argument was supplied.
[<EntryPoint>]
let main (argv: string[]) =
    if Array.isEmpty argv then
        // argv.[0] used to be read unconditionally, so running the program
        // without an argument ended in an IndexOutOfRangeException.
        eprintfn "Usage: pass an author name (or part of one) as the first argument."
        1
    else
        // Lower-cased once up front instead of once per filtered element.
        let needle = argv.[0].ToLower()

        // NOTE(review): presumably supplied by one of the opened project
        // modules (`data` / `MapReduce`) — confirm where getStopwords lives.
        let stopwords = getStopwords

        // Flatten the loaded books into (author, title) pairs, one pair per
        // author of each book.
        let booksMap =
            let bookData = (getData @"..\..\data")
            let bookToTuple book =
                book.authors |> List.map (fun a -> a, book.title)
            let booksToMap books =
                books |> Seq.collect bookToTuple
            bookData |> booksToMap

        // Matches runs of two or more word characters, captured as "word".
        let wordsRegex = new Regex("(?<word>\w{2,})", RegexOptions.Compiled)

        // Mapper: emits a single (lower-cased author, words) pair, where words
        // are the lower-cased title words that are not in the stopword list.
        let mapfunc (author:string, title) =
            let words =
                wordsRegex.Matches(title)
                |> Seq.cast<Match>
                |> Seq.map (fun m -> m.Groups.["word"].Value.ToLower())
                |> Seq.filter (fun w -> not (Seq.exists ((=) w) stopwords))
            [ author.ToLower() ,words ] |> Seq.ofList

        // Reducer: merges every word sequence received for an author and
        // yields (word, count) pairs, most frequent first.
        let reducefunc (author, words: seq<seq<string>>) =
            let countedWords =
                words
                |> Seq.collect (fun s -> s)
                |> Seq.groupBy (fun w -> w)
                |> Seq.map (fun (w,l) -> w, Seq.length l)
                // Negating the count makes the ascending sort descending.
                |> Seq.sortBy (fun (_,c) -> -c)
            author, countedWords

        // NOTE(review): map_reduce presumably groups the mapper output by key
        // and hands each group to the reducer — confirm against the MapReduce
        // module.
        let r = map_reduce mapfunc reducefunc booksMap

        let result = r |> Seq.filter (fun (k:string,_) -> k.Contains(needle))
        printfn "%A\n" result
        0 // return an integer exit code
|
||||
Reference in New Issue
Block a user