Skip to content

Instantly share code, notes, and snippets.

@jbtule
Last active November 2, 2020 07:16
Show Gist options
  • Save jbtule/83e330335b1afc924d508ce49b0927fc to your computer and use it in GitHub Desktop.
Save jbtule/83e330335b1afc924d508ce49b0927fc to your computer and use it in GitHub Desktop.
My Shortest CSV Parser
(*
* This work (My Shortest CSV Parser:CsvBasic.fs by James Tuley),
* identified by James Tuley, is free of known copyright restrictions
* Source: https://gist.github.com/jbtule/83e330335b1afc924d508ce49b0927fc
* http://creativecommons.org/publicdomain/mark/1.0/
*)
module CsvBasic
open System.IO

/// Short alias for the buffer that accumulates the characters of the cell being read.
type SB = System.Text.StringBuilder

/// Quoting state of the cell being read.
///   None   - no quote has been seen in this cell.
///   Open   - inside a quoted section; every character except '"' is taken literally.
///   Closed - just after a '"' seen while Open: the cell has ended, unless the next
///            character is another '"' (an escaped quote), which reopens the section.
/// NOTE: from here on, an unqualified `None` in this module means Quote.None, not Option.None.
[<Struct>] type internal Quote = None | Open | Closed

/// Accumulator threaded through the parser.
///   Table - completed rows, most recent first.
///   Row   - completed cells of the current row, most recent first.
///   Cell  - characters of the cell being read (mutable buffer, replaced after each cell).
///   Quote - quoting state of the cell being read.
[<Struct>] type internal CsvState = { Table: string list list; Row: string list; Cell: SB; Quote: Quote }

/// Reads `read` to its end and parses the text as comma-separated values.
///
/// Returns the rows in input order, each row being its cells in input order.
/// Cells may be wrapped in double quotes; inside quotes, commas and line breaks are
/// literal and a doubled quote ("") stands for one quote character.
/// Carriage returns outside quotes are discarded, '\n' ends a row, and lines that
/// contain nothing at all are skipped. Empty cells are kept, including a leading one.
/// A last row without a trailing newline is still returned.
///
/// Raises System.Exception (via failwithf) when a quote appears inside an unquoted
/// cell, or when a character other than ',', '\r', '\n' or '"' follows a closing quote.
/// The message carries the 1-based (row, cell) position.
let parseCsv (read:TextReader) =
    let empty () = { Table = []; Row = []; Cell = SB(); Quote = None }

    // 1-based (row, cell) position of the cell being read, used in error messages.
    let position (cs: CsvState) = (cs.Table.Length + 1, cs.Row.Length + 1)

    // Adds one character to the current cell and sets the quoting state.
    let append (c: char) (quote: Quote) (cs: CsvState) =
        { cs with Cell = cs.Cell.Append(c); Quote = quote }

    // Closes the current cell (even when it is empty) and starts a fresh one.
    let finalizeCell (cs: CsvState) =
        { empty () with Table = cs.Table; Row = (cs.Cell.ToString()) :: cs.Row }

    // A line is blank only if nothing was read on it: no cell, no character, no quote.
    // Checking the quote state keeps a lone "" from being mistaken for a blank line.
    let isBlankLine (cs: CsvState) =
        List.isEmpty cs.Row && cs.Cell.Length = 0 && cs.Quote = None

    // Closes the current row, unless the line was blank, in which case nothing is added.
    let finalizeRow (cs: CsvState) =
        if isBlankLine cs then cs
        else
            let withCell = finalizeCell cs
            { empty () with Table = (List.rev withCell.Row) :: withCell.Table }

    // State transition for a single input character.
    let step (cs: CsvState) (c: char) =
        match cs.Quote, c with
        | Open, '"' -> { cs with Quote = Closed }
        | Open, _ -> append c Open cs
        // A quote right after a closing quote is an escaped quote: keep it and reopen.
        | Closed, '"' -> append c Open cs
        | None, '"' when cs.Cell.Length = 0 -> { cs with Quote = Open }
        | None, '"' -> failwithf "Unescaped quote in cell %A" (position cs)
        | (Closed | None), ',' -> finalizeCell cs
        | (Closed | None), '\r' -> cs
        | (Closed | None), '\n' -> finalizeRow cs
        | Closed, _ -> failwithf "Extra char '%c' outside of cell %A" c (position cs)
        | None, _ -> append c None cs

    // Read() returns a negative value only at end of input. Peek() is not used because
    // it may also return -1 for readers that cannot peek, which would end parsing early.
    let rec consume (cs: CsvState) =
        let code = read.Read()
        if code < 0 then cs else consume (step cs (char code))

    // The last row may lack a trailing newline, so it is closed explicitly here.
    let parsed = consume (empty ()) |> finalizeRow
    List.rev parsed.Table

/// Opens the file at `path` for shared reading, decodes it with `enc` (byte-order-mark
/// detection is turned off) and parses the whole file with parseCsv.
/// The file is closed before the rows are returned.
let parseCsvFile enc path =
    // Both are bound with `use` so the file handle is released even if creating the reader fails.
    use stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
    use reader = new StreamReader(stream, encoding = enc, detectEncodingFromByteOrderMarks = false)
    parseCsv reader
(*
* This work (My Shortest CSV Parser:CsvLazy.fs by James Tuley),
* identified by James Tuley, is free of known copyright restrictions
* Source: https://gist.github.com/jbtule/83e330335b1afc924d508ce49b0927fc
* http://creativecommons.org/publicdomain/mark/1.0/
*)
module CsvLazy
open System.IO

/// Short alias for the buffer that accumulates the characters of the cell being read.
type SB = System.Text.StringBuilder

/// Quoting state of the cell being read.
///   None   - no quote has been seen in this cell.
///   Open   - inside a quoted section; every character except '"' is taken literally.
///   Closed - just after a '"' seen while Open: the cell has ended, unless the next
///            character is another '"' (an escaped quote), which reopens the section.
/// NOTE: from here on, an unqualified `None` in this module means Quote.None, not Option.None.
[<Struct>] type internal Quote = None | Open | Closed

/// Parser state carried from one character to the next.
///   Yield - the row completed by the last row break, cells in input order ([] if none).
///   Len   - number of rows completed so far (used for error positions).
///   Row   - completed cells of the current row, most recent first.
///   Cell  - characters of the cell being read (mutable buffer, replaced after each cell).
///   Quote - quoting state of the cell being read.
[<Struct>] type internal CsvState = { Yield: string list; Len:int; Row: string list; Cell: SB; Quote: Quote }

/// Parses the text of `read` as comma-separated values, producing rows on demand.
///
/// Returns a sequence of rows, each row being its cells in input order. Nothing is
/// read until the sequence is enumerated, and characters are pulled from `read` only
/// as rows are requested, so `read` must stay open while the sequence is consumed.
/// Cells may be wrapped in double quotes; inside quotes, commas and line breaks are
/// literal and a doubled quote ("") stands for one quote character.
/// Carriage returns outside quotes are discarded, '\n' ends a row, and lines that
/// contain nothing at all are skipped. Empty cells are kept, including a leading one.
/// A last row without a trailing newline is still produced.
///
/// Enumeration raises System.Exception (via failwithf) when a quote appears inside an
/// unquoted cell, or when a character other than ',', '\r', '\n' or '"' follows a
/// closing quote. The message carries the 1-based (row, cell) position.
let parseCsv (read:TextReader) =
    let empty len = { Yield = []; Len = len; Row = []; Cell = SB(); Quote = None }

    // 1-based (row, cell) position of the cell being read, used in error messages.
    let position (cs: CsvState) = (cs.Len + 1, cs.Row.Length + 1)

    // Adds one character to the current cell and sets the quoting state.
    let append (c: char) (quote: Quote) (cs: CsvState) =
        { cs with Cell = cs.Cell.Append(c); Quote = quote }

    // Closes the current cell (even when it is empty) and starts a fresh one.
    let finalizeCell (cs: CsvState) =
        { empty cs.Len with Row = (cs.Cell.ToString()) :: cs.Row }

    // A line is blank only if nothing was read on it: no cell, no character, no quote.
    // Checking the quote state keeps a lone "" from being mistaken for a blank line.
    let isBlankLine (cs: CsvState) =
        List.isEmpty cs.Row && cs.Cell.Length = 0 && cs.Quote = None

    // Closes the current row and places it in Yield; a blank line leaves Yield empty
    // and does not advance the row count.
    let finalizeRow (cs: CsvState) =
        if isBlankLine cs then empty cs.Len
        else { empty (cs.Len + 1) with Yield = List.rev (finalizeCell cs).Row }

    // State transition for a single input character.
    let step (cs: CsvState) (c: char) =
        match cs.Quote, c with
        | Open, '"' -> { cs with Quote = Closed }
        | Open, _ -> append c Open cs
        // A quote right after a closing quote is an escaped quote: keep it and reopen.
        | Closed, '"' -> append c Open cs
        | None, '"' when cs.Cell.Length = 0 -> { cs with Quote = Open }
        | None, '"' -> failwithf "Unescaped quote in cell %A" (position cs)
        | (Closed | None), ',' -> finalizeCell cs
        | (Closed | None), '\r' -> cs
        | (Closed | None), '\n' -> finalizeRow cs
        | Closed, _ -> failwithf "Extra char '%c' outside of cell %A" c (position cs)
        | None, _ -> append c None cs

    seq {
        let mutable state = empty 0
        // Read() returns a negative value only at end of input. Peek() is not used because
        // it may also return -1 for readers that cannot peek, which would end parsing early.
        let mutable code = read.Read()
        while code >= 0 do
            state <- step state (char code)
            let row = state.Yield
            if not (List.isEmpty row) then
                // Clear the finished row first: `append` copies the state, so a row left
                // in Yield would otherwise be produced again on the next character.
                state <- { state with Yield = [] }
                yield row
            code <- read.Read()
        // The last row may lack a trailing newline, so it is closed explicitly here.
        let last = finalizeRow state
        if not (List.isEmpty last.Yield) then yield last.Yield
    }

/// Parses the file at `path` as CSV, producing rows on demand.
/// The file is opened for shared reading and decoded with `enc` (byte-order-mark
/// detection is turned off). It is opened when enumeration of the returned sequence
/// starts and closed when that enumeration finishes or its enumerator is disposed.
let parseCsvFile enc path =
    seq { // Both are bound with `use` so the file handle is released even if creating the reader fails.
          use stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite)
          use reader = new StreamReader(stream, encoding = enc, detectEncodingFromByteOrderMarks = false)
          yield! parseCsv reader }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment