Initial commit — Coursera Big Data coursework

This commit is contained in:
2026-05-09 03:03:14 +00:00
commit c85c238e41
30 changed files with 2488 additions and 0 deletions
+17
View File
@@ -0,0 +1,17 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
</startup>
<runtime>
<!-- Assembly binding redirects for the F# core library. -->
<assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
<dependentAssembly>
<!-- The redirects below apply to the FSharp.Core assembly with this public key token. -->
<assemblyIdentity name="FSharp.Core" publicKeyToken="b03f5f7f11d50a3a" culture="neutral"/>
<!-- Requests for FSharp.Core 4.0.0.0, 2.3.5.0 or 2.0.0.0 are all redirected to 4.3.0.0. -->
<bindingRedirect oldVersion="4.0.0.0" newVersion="4.3.0.0"/>
<bindingRedirect oldVersion="2.3.5.0" newVersion="4.3.0.0"/>
<bindingRedirect oldVersion="2.0.0.0" newVersion="4.3.0.0"/>
</dependentAssembly>
</assemblyBinding>
</runtime>
</configuration>
+69
View File
@@ -0,0 +1,69 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<SchemaVersion>2.0</SchemaVersion>
<ProjectGuid>1b4ca350-550f-4533-90f4-eb65ba6a7b8b</ProjectGuid>
<OutputType>Exe</OutputType>
<RootNamespace>AuthorTerms</RootNamespace>
<AssemblyName>AuthorTerms</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<Name>AuthorTerms</Name>
</PropertyGroup>
<!-- Settings used only for the Debug|AnyCPU configuration: full debug symbols,
optimizations and tail calls switched off, output and XML docs under bin\Debug\. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
<DebugSymbols>true</DebugSymbols>
<DebugType>full</DebugType>
<Optimize>false</Optimize>
<Tailcalls>false</Tailcalls>
<OutputPath>bin\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<WarningLevel>3</WarningLevel>
<PlatformTarget>AnyCPU</PlatformTarget>
<DocumentationFile>bin\Debug\AuthorTerms.XML</DocumentationFile>
<Prefer32Bit>true</Prefer32Bit>
<!-- Command-line argument supplied when the program is started from the IDE;
Program.fs reads argv.[0] as the author-name filter. -->
<StartArguments>Dewitt</StartArguments>
</PropertyGroup>
<!-- Settings used only for the Release|AnyCPU configuration: pdb-only debug info,
optimizations and tail calls switched on, output and XML docs under bin\Release\. -->
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<DebugType>pdbonly</DebugType>
<Optimize>true</Optimize>
<Tailcalls>true</Tailcalls>
<OutputPath>bin\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<WarningLevel>3</WarningLevel>
<PlatformTarget>AnyCPU</PlatformTarget>
<DocumentationFile>bin\Release\AuthorTerms.XML</DocumentationFile>
<Prefer32Bit>true</Prefer32Bit>
<!-- Command-line argument supplied when the program is started from the IDE;
Program.fs reads argv.[0] as the author-name filter. -->
<StartArguments>Henzinger</StartArguments>
</PropertyGroup>
<ItemGroup>
<Reference Include="mscorlib" />
<Reference Include="FSharp.Core, Version=4.3.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">
<Private>True</Private>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Numerics" />
</ItemGroup>
<ItemGroup>
<!-- stopwords.py is not compiled; it is copied to the output directory (when newer)
because data.fs opens it by the relative path "stopwords.py" at run time. -->
<Content Include="stopwords.py">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
<!-- F# compiles source files in the order listed here. Program.fs opens the
data and MapReduce modules, so those two files must stay above it. -->
<Compile Include="data.fs" />
<Compile Include="MapReduce.fs" />
<Compile Include="Program.fs" />
<None Include="App.config" />
</ItemGroup>
<PropertyGroup>
<MinimumVisualStudioVersion Condition="'$(MinimumVisualStudioVersion)' == ''">11</MinimumVisualStudioVersion>
</PropertyGroup>
<Import Project="$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets" Condition=" Exists('$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets')" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>
+20
View File
@@ -0,0 +1,20 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2012
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "AuthorTerms", "AuthorTerms.fsproj", "{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{1B4CA350-550F-4533-90F4-EB65BA6A7B8B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal
+39
View File
@@ -0,0 +1,39 @@
module MapReduce
//let map_reduce
// // Map function takes a pair and creates a sequence of key/value pairs
// (m:'k1 -> 'v1 -> seq<'k2 * 'v2>)
// // Reduce function takes key and sequence to produce optional value
// (r:'k2 -> seq<'v2> -> 'v3)
// // Takes an input of key/value pairs to produce an output key/value pairs
// : Map<'k1, 'v1> -> Map<'k2, 'v3> =
//
// let map_per_key : Map<'k1, 'v1> -> seq<('k2 * 'v2)> =
// Map.toSeq >> // 1. Map into a sequence
// Seq.map (fun (k, v) -> m k v) >> // 2. Map m over a list of pairs
// Seq.concat // 3. Concat per-key lists
//
// let group_by_key (l:seq<('k2 * 'v2)>) : Map<'k2,seq<'v2>> =
// l
// |> Seq.groupBy fst
// |> Seq.map (fun(k,vs) -> k, Seq.map snd vs)
// |> Map.ofSeq
//
// let reduce_per_key : Map<'k2, seq<'v2>> -> Map<'k2,'v3> =
// let un_some k (Some v) = v // Remove optional type
// let is_some k = function
// | Some _ -> true // Keep entries
// | None -> false // Remove entries
// Map.map r //>> // 1. Apply reduce per key
// //Map.filter is_some >> // 2. Remove None entries
// //Map.map un_some // 3. Transform to remove option
//
// map_per_key >> // 1. Apply map function to each key/value pair
// group_by_key >> // 2. Group intermediate data per key
// reduce_per_key // 3. Apply reduce to each group
/// Minimal in-memory map/reduce over a sequence of pairs.
///
/// `map` is applied to every input pair and must return a sequence of
/// (key, value) pairs; the emitted values are grouped by key (keys keep the
/// order in which they were first emitted) and `reduce` is then called once
/// per key with the tuple (key, values-for-that-key).
/// Returns the lazy sequence of whatever `reduce` produces.
let map_reduce map reduce (inputs:seq<_*_>) =
    inputs
    |> Seq.collect map
    |> Seq.groupBy fst
    |> Seq.map (fun (key, pairs) -> reduce (key, pairs |> Seq.map snd))
+47
View File
@@ -0,0 +1,47 @@
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open data
open MapReduce
open System.Text.RegularExpressions
/// Entry point: prints, for every author whose lower-cased name contains the
/// first command-line argument, the words used in that author's titles
/// together with how often each occurs, most frequent first.
///
/// argv.[0] is the author-name fragment to search for (case-insensitive).
/// Returns 0 on success, or 1 (after printing a usage line to stderr) when
/// no argument was supplied.
[<EntryPoint>]
let main argv =
    if Array.isEmpty argv then
        // Previously this path crashed with an index-out-of-range exception.
        eprintfn "usage: AuthorTerms <author-name-fragment>"
        1
    else
        // Lower-case the search term once instead of once per result.
        let needle = argv.[0].ToLower()
        // A set gives logarithmic membership tests; the raw stop-word sequence
        // is lazy and was being re-scanned linearly for every title word.
        let stopwords = Set.ofSeq getStopwords
        // One (author, title) pair per author of every book in the data folder.
        let booksMap =
            getData @"..\..\data"
            |> Seq.collect (fun book -> book.authors |> List.map (fun a -> a, book.title))
        // A "word" is any run of two or more word characters.
        let wordsRegex = new Regex(@"(?<word>\w{2,})", RegexOptions.Compiled)
        // Map step: emit (lower-cased author, lower-cased non-stop-words of the title).
        let mapfunc (author: string, title: string) =
            let words =
                wordsRegex.Matches(title)
                |> Seq.cast<Match>
                |> Seq.map (fun m -> m.Groups.["word"].Value.ToLower())
                |> Seq.filter (fun w -> not (Set.contains w stopwords))
            Seq.singleton (author.ToLower(), words)
        // Reduce step: flatten the per-title word lists of one author and
        // count each distinct word, sorted by descending count.
        let reducefunc (author, words: seq<seq<string>>) =
            let countedWords =
                words
                |> Seq.concat
                |> Seq.countBy id
                |> Seq.sortBy (fun (_, c) -> -c)
            author, countedWords
        let r = map_reduce mapfunc reducefunc booksMap
        let result = r |> Seq.filter (fun (k: string, _) -> k.Contains(needle))
        printfn "%A\n" result
        0 // return an integer exit code
+31
View File
@@ -0,0 +1,31 @@
module data
open System.IO
open System.Text.RegularExpressions
/// Stop words scraped from `stopwords.py` in the current directory.
/// The file is read once, when this value is initialised; every run of word
/// characters enclosed in single quotes is yielded as one stop word.
/// The result is a lazy sequence over the regex matches.
let getStopwords =
    let source = File.ReadAllText(@"stopwords.py")
    let quotedWords = Regex.Matches(source, @"'(?<stopword>\w*)'")
    let stopwordOf (m: Match) = m.Groups.["stopword"].Value
    Seq.map stopwordOf (Seq.cast<Match> quotedWords)
/// One record parsed from a line of a data file.
type Book = {
    /// First `:::`-separated field of the line.
    id : string
    /// Second field, split on `::` into individual author names.
    authors : string list
    /// Third field.
    title : string
}

/// Parses every file in `directory` into `Book` records.
///
/// Each non-blank line must look like `id:::author1::author2:::title`.
/// Blank (or whitespace-only) lines are skipped. A non-blank line with fewer
/// than three `:::`-separated fields raises an exception that names the file
/// and the 1-based line number, instead of a bare index-out-of-range error.
///
/// The file list is taken when `getData` is called; each file is read when
/// the returned sequence is enumerated.
let getData directory =
    let parseLine filename lineIndex (line: string) =
        match Regex.Split(line, ":::") with
        | fields when fields.Length >= 3 ->
            { id = fields.[0]
              authors = Regex.Split(fields.[1], "::") |> List.ofArray
              title = fields.[2] }
        | _ ->
            failwithf "%s(%d): expected 'id:::authors:::title' but got '%s'"
                filename (lineIndex + 1) line
    let parseFile filename =
        File.ReadAllLines(filename)
        // Keep the original index so error messages report the real line number.
        |> Seq.mapi (fun i line -> i, line)
        |> Seq.filter (fun (_, line) -> not (System.String.IsNullOrWhiteSpace line))
        |> Seq.map (fun (i, line) -> parseLine filename i line)
    Directory.GetFiles(directory) |> Seq.collect parseFile
Binary file not shown.
+1
View File
@@ -0,0 +1 @@
# Stop-word table. data.fs also scrapes this file with a regex over single-quoted tokens, so every key must be a bare word with no surrounding whitespace.
allStopWords={'about':1, 'above':1, 'after':1, 'again':1, 'against':1, 'all':1, 'am':1, 'an':1, 'and':1, 'any':1, 'are':1, 'arent':1, 'as':1, 'at':1, 'be':1, 'because':1, 'been':1, 'before':1, 'being':1, 'below':1, 'between':1, 'both':1, 'but':1, 'by':1, 'cant':1, 'cannot':1, 'could':1, 'couldnt':1, 'did':1, 'didnt':1, 'do':1, 'does':1, 'doesnt':1, 'doing':1, 'dont':1, 'down':1, 'during':1, 'each':1, 'few':1, 'for':1, 'from':1, 'further':1, 'had':1, 'hadnt':1, 'has':1, 'hasnt':1, 'have':1, 'havent':1, 'having':1, 'he':1, 'hed':1, 'hell':1, 'hes':1, 'her':1, 'here':1, 'heres':1, 'hers':1, 'herself':1, 'him':1, 'himself':1, 'his':1, 'how':1, 'hows':1, 'i':1, 'id':1, 'ill':1, 'im':1, 'ive':1, 'if':1, 'in':1, 'into':1, 'is':1, 'isnt':1, 'it':1, 'its':1, 'its':1, 'itself':1, 'lets':1, 'me':1, 'more':1, 'most':1, 'mustnt':1, 'my':1, 'myself':1, 'no':1, 'nor':1, 'not':1, 'of':1, 'off':1, 'on':1, 'once':1, 'only':1, 'or':1, 'other':1, 'ought':1, 'our':1, 'ours':1, 'ourselves':1, 'out':1, 'over':1, 'own':1, 'same':1, 'shant':1, 'she':1, 'shed':1, 'shell':1, 'shes':1, 'should':1, 'shouldnt':1, 'so':1, 'some':1, 'such':1, 'than':1, 'that':1, 'thats':1, 'the':1, 'their':1, 'theirs':1, 'them':1, 'themselves':1, 'then':1, 'there':1, 'theres':1, 'these':1, 'they':1, 'theyd':1, 'theyll':1, 'theyre':1, 'theyve':1, 'this':1, 'those':1, 'through':1, 'to':1, 'too':1, 'under':1, 'until':1, 'up':1, 'very':1, 'was':1, 'wasnt':1, 'we':1, 'wed':1, 'well':1, 'were':1, 'weve':1, 'were':1, 'werent':1, 'what':1, 'whats':1, 'when':1, 'whens':1, 'where':1, 'wheres':1, 'which':1, 'while':1, 'who':1, 'whos':1, 'whom':1, 'why':1, 'whys':1, 'with':1, 'wont':1, 'would':1, 'wouldnt':1, 'you':1, 'youd':1, 'youll':1, 'youre':1, 'youve':1, 'your':1, 'yours':1, 'yourself':1, 'yourselves':1}