Created
October 5, 2023 17:42
-
-
Save ImaginaryDevelopment/d0e4e51f94f02d9fa142ff7462d11f14 to your computer and use it in GitHub Desktop.
some sample code used to scrape data from one set based on another
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Walk two sets of data (photos and students) and remove the overlap.
/// Lower-cases `x` with the invariant culture, so the result does not depend
/// on the machine's current locale.
let toLower (x: string) : string =
    x.ToLowerInvariant()
/// Returns the part of `value` that follows the last occurrence of `delimiter`.
/// When `delimiter` is empty or does not occur in `value`, `value` is returned
/// unchanged. When `value` ends with `delimiter`, the result is the empty string.
let afterLast (delimiter: string) (value: string) =
    if System.String.IsNullOrEmpty delimiter then
        value
    else
        // Ordinal search: path separators and similar delimiters should match
        // character-for-character, independent of the current culture.
        match value.LastIndexOf(delimiter, System.StringComparison.Ordinal) with
        | -1 -> value
        // Skip the whole delimiter, not just its first character.
        | idx -> value.Substring(idx + delimiter.Length)
// Photo names: skip the first two lines of the file (presumably header rows;
// confirm against the export), strip surrounding double quotes, lower-case,
// then keep at most the first 1,000 entries.
let photos =
    File.ReadAllLines(@"C:\Users\B\Documents\lancephotos.csv")
    |> Array.skip 2
    |> Array.map (fun line -> toLower (line.Trim '"'))
    |> Array.truncate 1_000

// Student entries: same clean-up as the photos, then keep only the text after
// the last backslash. No truncation is applied here.
let students =
    File.ReadAllLines(@"C:\Users\B\Documents\lancestudents.csv")
    |> Array.skip 2
    |> Array.map (fun line -> afterLast "\\" (toLower (line.Trim '"')))
// |> Array.truncate 5
//(photos,students).Dump()
/// Converts a millisecond count to whole seconds using integer division,
/// so any fractional second is dropped.
let msToSeconds (ms: int64) : int64 = ms / 1000L
// The raw data had quotes in it because it was not read as a real CSV.
// Dump whether any double quote is still present in either cleaned set.
(Array.exists (fun (photo: string) -> photo.Contains "\"") photos,
 Array.exists (fun (s: string) -> s.Contains "\"") students).Dump()

printfn "%i photos, %i students" (Array.length photos) (Array.length students)

// Total number of photos; used as the denominator for progress reporting.
let maxI = Array.length photos
/// Formats `x` with the .NET "N0" numeric format: digit grouping and no
/// decimal places.
let commaChameleon (x: float) : string =
    x.ToString("N0")
/// Builds the one-line progress message shown while a comparer is running.
///   andMyFriends - total milliseconds elapsed so far.
///   i            - number of items processed, used for the items-per-second rate.
///   v            - number of items processed, used for the displayed count and
///                  for the percentage of `maxI`.
/// Returns the formatted message.
let rateMe (andMyFriends: int64) i v =
    let seconds = msToSeconds andMyFriends
    // No rate can be computed until at least one whole second has elapsed.
    let rate = if seconds > 0L then int64 i / seconds else 0L
    // Scale the fraction to a real percentage (0-100); report 0 when there is
    // nothing to process instead of printing NaN.
    let percent = if maxI > 0 then float v * 100.0 / float maxI else 0.0
    sprintf "Finished %s(%.2f%%) %i per second in %i seconds" (commaChameleon v) percent rate seconds
/// Runs one overlap-removal strategy over `photos`, reporting progress as it goes.
///   title      - label dumped above the progress line and returned with the result.
///   fStudents  - builds the lookup structure from the `students` array; the
///                stopwatch is already running, so its cost is included in the timing.
///   fPredicate - given the lookup and a photo, returns true when the photo is kept.
/// Returns (title, elapsed milliseconds, number of photos kept).
let genericComparer (title: string) fStudents fPredicate =
    let timer = System.Diagnostics.Stopwatch.StartNew()
    let dc = DumpContainer()
    dc.Dump(title) |> ignore
    let dumpProgress (processed: int) =
        dc.Content <- rateMe timer.ElapsedMilliseconds processed (float processed)
    let students = fStudents students
    // A `let mutable` local cannot be captured by a closure (error FS0407),
    // so the processed-item counter lives in a ref cell instead.
    let processed = ref 0
    let items =
        photos
        |> Array.filter (fun photo ->
            let i = processed.Value
            // Refresh the progress display every 500 photos.
            if i % 500 = 0 then
                Util.Progress <- i * 100 / maxI
                dumpProgress i
            processed.Value <- i + 1
            fPredicate students photo)
    timer.Stop()
    // Final update so the display shows the full count.
    dumpProgress processed.Value
    title, timer.ElapsedMilliseconds, items.Length
/// Substring strategy: students are loaded into a Set, and a photo is kept
/// only when no student entry contains the photo name as a substring.
let hashContains () =
    let noStudentContains (studs: Set<string>) (photo: string) =
        studs |> Set.forall (fun student -> not (student.Contains photo))
    genericComparer "hashC" Set.ofArray noStudentContains
/// Exact-match strategy: students are loaded into a Set, and a photo is kept
/// only when the Set does not contain the photo name.
let hash2 () =
    let isUnmatched (studs: Set<string>) (photo: string) =
        not (Set.contains photo studs)
    genericComparer "hash2" Set.ofArray isUnmatched
/// Map strategy: students are loaded into a Map keyed by (and mapping to) the
/// student string, and a photo is kept only when it is not a key of that Map.
let dic () =
    let buildLookup (names: string[]) =
        names
        |> Array.map (fun name -> name, name)
        |> Map.ofArray
    let isUnmatched lookup (photo: string) =
        not (Map.containsKey photo lookup)
    genericComparer "dic" buildLookup isUnmatched
// Run every enabled comparer in parallel and block until all have finished.
// The result is one (title, elapsed ms, kept count) tuple per comparer, in list order.
[
    //seqVersion
    //seqVersion2
    //hashVersion
    hashContains
    hash2
    dic
]
|> Seq.map (fun run -> async { return run () })
|> Async.Parallel
|> Async.RunSynchronously
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment