Initial commit — Coursera Big Data coursework
This commit is contained in:
@@ -0,0 +1,17 @@
|
||||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<configuration>
|
||||
<startup>
|
||||
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
|
||||
</startup>
|
||||
<runtime>
|
||||
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
|
||||
<dependentAssembly>
|
||||
<assemblyIdentity name="FSharp.Core" publicKeyToken="b03f5f7f11d50a3a" culture="neutral"/>
|
||||
<bindingRedirect oldVersion="4.0.0.0" newVersion="4.3.0.0"/>
|
||||
<bindingRedirect oldVersion="2.3.5.0" newVersion="4.3.0.0"/>
|
||||
<bindingRedirect oldVersion="2.0.0.0" newVersion="4.3.0.0"/>
|
||||
|
||||
</dependentAssembly>
|
||||
</assemblyBinding>
|
||||
</runtime>
|
||||
</configuration>
|
||||
@@ -0,0 +1,69 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
|
||||
<PropertyGroup>
|
||||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
|
||||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
|
||||
<SchemaVersion>2.0</SchemaVersion>
|
||||
<ProjectGuid>1b4ca350-550f-4533-90f4-eb65ba6a7b8b</ProjectGuid>
|
||||
<OutputType>Exe</OutputType>
|
||||
<RootNamespace>AuthorTerms</RootNamespace>
|
||||
<AssemblyName>AuthorTerms</AssemblyName>
|
||||
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
|
||||
<Name>AuthorTerms</Name>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
|
||||
<DebugSymbols>true</DebugSymbols>
|
||||
<DebugType>full</DebugType>
|
||||
<Optimize>false</Optimize>
|
||||
<Tailcalls>false</Tailcalls>
|
||||
<OutputPath>bin\Debug\</OutputPath>
|
||||
<DefineConstants>DEBUG;TRACE</DefineConstants>
|
||||
<WarningLevel>3</WarningLevel>
|
||||
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||
<DocumentationFile>bin\Debug\AuthorTerms.XML</DocumentationFile>
|
||||
<Prefer32Bit>true</Prefer32Bit>
|
||||
<StartArguments>Dewitt</StartArguments>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
|
||||
<DebugType>pdbonly</DebugType>
|
||||
<Optimize>true</Optimize>
|
||||
<Tailcalls>true</Tailcalls>
|
||||
<OutputPath>bin\Release\</OutputPath>
|
||||
<DefineConstants>TRACE</DefineConstants>
|
||||
<WarningLevel>3</WarningLevel>
|
||||
<PlatformTarget>AnyCPU</PlatformTarget>
|
||||
<DocumentationFile>bin\Release\AuthorTerms.XML</DocumentationFile>
|
||||
<Prefer32Bit>true</Prefer32Bit>
|
||||
<StartArguments>Henzinger</StartArguments>
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="mscorlib" />
|
||||
<Reference Include="FSharp.Core, Version=4.3.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">
|
||||
<Private>True</Private>
|
||||
</Reference>
|
||||
<Reference Include="System" />
|
||||
<Reference Include="System.Core" />
|
||||
<Reference Include="System.Numerics" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Content Include="stopwords.py">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
<Compile Include="data.fs" />
|
||||
<Compile Include="MapReduce.fs" />
|
||||
<Compile Include="Program.fs" />
|
||||
<None Include="App.config" />
|
||||
</ItemGroup>
|
||||
<PropertyGroup>
|
||||
<MinimumVisualStudioVersion Condition="'$(MinimumVisualStudioVersion)' == ''">11</MinimumVisualStudioVersion>
|
||||
</PropertyGroup>
|
||||
<Import Project="$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets" Condition=" Exists('$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets')" />
|
||||
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
|
||||
Other similar extension points exist, see Microsoft.Common.targets.
|
||||
<Target Name="BeforeBuild">
|
||||
</Target>
|
||||
<Target Name="AfterBuild">
|
||||
</Target>
|
||||
-->
|
||||
</Project>
|
||||
@@ -0,0 +1,20 @@
|
||||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 2012
|
||||
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "AuthorTerms", "AuthorTerms.fsproj", "{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
Release|Any CPU = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
EndGlobal
|
||||
@@ -0,0 +1,39 @@
|
||||
module MapReduce
|
||||
|
||||
//let map_reduce
|
||||
// // Map function takes a key/value pair and creates a sequence of key/value pairs
|
||||
// (m:'k1 -> 'v1 -> seq<'k2 * 'v2>)
|
||||
// // Reduce function takes key and sequence to produce optional value
|
||||
// (r:'k2 -> seq<'v2> -> 'v3)
|
||||
// // Takes an input of key/value pairs to produce an output key/value pairs
|
||||
// : Map<'k1, 'v1> -> Map<'k2, 'v3> =
|
||||
//
|
||||
// let map_per_key : Map<'k1, 'v1> -> seq<('k2 * 'v2)> =
|
||||
// Map.toSeq >> // 1. Map into a sequence
|
||||
// Seq.map (fun (k, v) -> m k v) >> // 2. Map m over a list of pairs
|
||||
// Seq.concat // 3. Concat per-key lists
|
||||
//
|
||||
// let group_by_key (l:seq<('k2 * 'v2)>) : Map<'k2,seq<'v2>> =
|
||||
// l
|
||||
// |> Seq.groupBy fst
|
||||
// |> Seq.map (fun(k,vs) -> k, Seq.map snd vs)
|
||||
// |> Map.ofSeq
|
||||
//
|
||||
// let reduce_per_key : Map<'k2, seq<'v2>> -> Map<'k2,'v3> =
|
||||
// let un_some k (Some v) = v // Remove optional type
|
||||
// let is_some k = function
|
||||
// | Some _ -> true // Keep entries
|
||||
// | None -> false // Remove entries
|
||||
// Map.map r //>> // 1. Apply reduce per key
|
||||
// //Map.filter is_some >> // 2. Remove None entries
|
||||
// //Map.map un_some // 3. Transform to remove option
|
||||
//
|
||||
// map_per_key >> // 1. Apply map function to each key/value pair
|
||||
// group_by_key >> // 2. Group intermediate data per key
|
||||
// reduce_per_key // 3. Apply reduce to each group
|
||||
|
||||
/// Minimal sequential map/reduce over a sequence of key/value pairs.
///   map    - turns one input pair into a collection of intermediate (key, value) pairs
///   reduce - receives (key, all intermediate values emitted for that key)
/// Returns one reduce result per distinct intermediate key, as a lazy sequence.
let map_reduce map reduce (inputs:seq<_*_>) =
    inputs
    |> Seq.collect map                  // map every input pair and flatten the emitted pairs
    |> Seq.groupBy fst                  // bucket the intermediate pairs by key
    |> Seq.map (fun (key, pairs) -> reduce (key, Seq.map snd pairs))
|
||||
@@ -0,0 +1,47 @@
|
||||
// Learn more about F# at http://fsharp.net
|
||||
// See the 'F# Tutorial' project for more help.
|
||||
|
||||
open data
|
||||
open MapReduce
|
||||
open System.Text.RegularExpressions
|
||||
|
||||
[<EntryPoint>]
let main argv =
    // Prints, for every author whose lower-cased name contains argv.[0]
    // (lower-cased), the non-stopword words of that author's titles together
    // with their counts, most frequent first.
    // Returns 0 on success, 1 when no author filter was supplied.
    if Array.isEmpty argv then
        // Without this guard the missing argument surfaced as an
        // IndexOutOfRangeException while the result was being printed.
        eprintfn "usage: AuthorTerms <author-name-fragment>"
        1
    else
        // Lower-case the filter once instead of once per author.
        let authorFilter = argv.[0].ToLower()

        // A set gives cheap membership tests; the raw sequence used to be
        // scanned linearly for every single title word.
        let stopwords = getStopwords |> Set.ofSeq

        // One (author, title) pair per author of each book.
        let booksMap =
            getData @"..\..\data"
            |> Seq.collect (fun book -> book.authors |> List.map (fun a -> a, book.title))

        // A word is any run of two or more word characters.
        let wordsRegex = new Regex("(?<word>\w{2,})", RegexOptions.Compiled)

        // Map step: emit a single (lower-cased author, title words) pair.
        let mapfunc (author:string, title) =
            let words =
                wordsRegex.Matches(title)
                |> Seq.cast<Match>
                |> Seq.map (fun m -> m.Groups.["word"].Value.ToLower())
                |> Seq.filter (fun w -> not (Set.contains w stopwords))
            [ author.ToLower(), words ] |> Seq.ofList

        // Reduce step: merge all word lists of one author and count each word.
        let reducefunc (author, words: seq<seq<string>>) =
            let countedWords =
                words
                |> Seq.concat
                |> Seq.countBy id
                |> Seq.sortBy (fun (_, c) -> -c)   // negate the count: highest first
            author, countedWords

        let result =
            map_reduce mapfunc reducefunc booksMap
            |> Seq.filter (fun (k:string, _) -> k.Contains(authorFilter))
        printfn "%A\n" result
        0 // return an integer exit code
|
||||
@@ -0,0 +1,31 @@
|
||||
module data
|
||||
open System.IO
|
||||
open System.Text.RegularExpressions
|
||||
|
||||
/// Stop words extracted from the single-quoted tokens found in "stopwords.py".
/// This is a value, not a function: the file is read and matched once, when
/// the binding is initialized; the returned sequence projects the matches lazily.
let getStopwords =
    let quotedWords = Regex.Matches(File.ReadAllText(@"stopwords.py"), @"'(?<stopword>\w*)'")
    seq { for m in Seq.cast<Match> quotedWords -> m.Groups.["stopword"].Value }
|
||||
|
||||
/// One parsed record of the input data set.
type Book =
    { id : string             // first ":::"-separated field of the line
      authors : string list   // second field, split on "::"
      title : string }        // third field
|
||||
|
||||
/// Lazily reads every file in `directory` and parses each non-blank line of the
/// form "id:::author1::author2:::title" into a Book.
/// Fails with a message naming the file and the offending line when a
/// non-blank line has fewer than three ":::"-separated fields.
let getData directory =
    // Parses a single record line; `filename` is only used for error reporting.
    let parseLine filename (line:string) =
        let fields = Regex.Split(line, ":::")
        if fields.Length < 3 then
            // Previously this surfaced as a bare IndexOutOfRangeException.
            failwithf "Malformed record in %s (expected id:::authors:::title): %s" filename line
        { id = fields.[0]
          authors = Regex.Split(fields.[1], "::") |> List.ofArray
          title = fields.[2] }

    let parseFile filename =
        File.ReadAllLines(filename)
        // Blank lines (e.g. a trailing newline at the end of a file) carry no record.
        |> Seq.filter (fun l -> not (System.String.IsNullOrWhiteSpace l))
        |> Seq.map (parseLine filename)

    Directory.GetFiles(directory) |> Seq.collect parseFile
|
||||
|
||||
Binary file not shown.
@@ -0,0 +1 @@
|
||||
# Stop words, stored as a dict whose values are all 1.
# data.fs also reads this file as plain text and extracts every single-quoted
# run of word characters, so keys must be quoted with single quotes and must
# not contain spaces; keep quote characters out of these comments as well.
allStopWords = {
    'about':1, 'above':1, 'after':1, 'again':1, 'against':1, 'all':1, 'am':1, 'an':1, 'and':1, 'any':1,
    'are':1, 'arent':1, 'as':1, 'at':1, 'be':1, 'because':1, 'been':1, 'before':1, 'being':1, 'below':1,
    'between':1, 'both':1, 'but':1, 'by':1, 'cant':1, 'cannot':1, 'could':1, 'couldnt':1, 'did':1, 'didnt':1,
    'do':1, 'does':1, 'doesnt':1, 'doing':1, 'dont':1, 'down':1, 'during':1, 'each':1, 'few':1, 'for':1,
    'from':1, 'further':1, 'had':1, 'hadnt':1, 'has':1, 'hasnt':1, 'have':1, 'havent':1, 'having':1, 'he':1,
    'hed':1, 'hell':1, 'hes':1, 'her':1, 'here':1, 'heres':1, 'hers':1, 'herself':1, 'him':1, 'himself':1,
    'his':1, 'how':1, 'hows':1, 'i':1, 'id':1, 'ill':1, 'im':1, 'ive':1, 'if':1, 'in':1,
    'into':1, 'is':1, 'isnt':1, 'it':1, 'its':1, 'itself':1, 'lets':1, 'me':1, 'more':1, 'most':1,
    'mustnt':1, 'my':1, 'myself':1, 'no':1, 'nor':1, 'not':1, 'of':1, 'off':1, 'on':1, 'once':1,
    'only':1, 'or':1, 'other':1, 'ought':1, 'our':1, 'ours':1, 'ourselves':1, 'out':1, 'over':1, 'own':1,
    'same':1, 'shant':1, 'she':1, 'shed':1, 'shell':1, 'shes':1, 'should':1, 'shouldnt':1, 'so':1, 'some':1,
    'such':1, 'than':1, 'that':1, 'thats':1, 'the':1, 'their':1, 'theirs':1, 'them':1, 'themselves':1, 'then':1,
    'there':1, 'theres':1, 'these':1, 'they':1, 'theyd':1, 'theyll':1, 'theyre':1, 'theyve':1, 'this':1, 'those':1,
    'through':1, 'to':1, 'too':1, 'under':1, 'until':1, 'up':1, 'very':1, 'was':1, 'wasnt':1, 'we':1,
    'wed':1, 'well':1, 'were':1, 'weve':1, 'werent':1, 'what':1, 'whats':1, 'when':1, 'whens':1, 'where':1,
    'wheres':1, 'which':1, 'while':1, 'who':1, 'whos':1, 'whom':1, 'why':1, 'whys':1, 'with':1, 'wont':1,
    'would':1, 'wouldnt':1, 'you':1, 'youd':1, 'youll':1, 'youre':1, 'youve':1, 'your':1, 'yours':1, 'yourself':1,
    'yourselves':1,
}
|
||||
Reference in New Issue
Block a user