Browse Source

Add dedicated ISO 8601 parser.

Jeroen van Rijn 1 year ago
parent
commit
3526042f1e
2 changed files with 180 additions and 4 deletions
  1. 113 0
      core/time/iso8061.odin
  2. 67 4
      tests/core/time/test_core_time.odin

+ 113 - 0
core/time/iso8061.odin

@@ -0,0 +1,113 @@
+package time
+// Parsing ISO 8601 date/time strings into time.Time.
+
+import dt "core:time/datetime"
+
+// Parses an ISO 8601 date/time string and returns the corresponding Time normalized to UTC,
+// i.e. with the parsed UTC offset (if any) subtracted from the parsed wall-clock time.
+// Note: Only 4-digit years are accepted.
+// The optional `is_leap` pointer is passed through to `iso8601_to_time_and_offset`,
+// which reports through it whether the moment was a leap second.
+// Leap seconds are smeared into 23:59:59.
+// `consumed` is the count reported by `iso8601_to_time_and_offset`; it is 0 if parsing failed.
+iso8601_to_time_utc :: proc(iso_datetime: string, is_leap: ^bool = nil) -> (res: Time, consumed: int) {
+	offset_minutes: int
+	res, offset_minutes, consumed = iso8601_to_time_and_offset(iso_datetime, is_leap)
+
+	// The offset is in minutes and positive for `+hh:mm`, so subtracting it yields UTC.
+	res._nsec -= i64(offset_minutes) * i64(Minute)
+	return
+}
+
+// Parses an ISO 8601 date/time string, e.g. 1985-04-12T23:20:50.52Z, and returns it as a Time
+// together with its UTC offset in minutes. The offset is returned separately and is not applied to the Time.
+// Note: Only 4-digit years are accepted.
+// If `is_leap` is non-nil and the string could be split into components, it is set to whether the moment was a leap second.
+// Leap seconds are smeared into 23:59:59.
+// On failure all results are zero, including `consumed`.
+iso8601_to_time_and_offset :: proc(iso_datetime: string, is_leap: ^bool = nil) -> (res: Time, utc_offset: int, consumed: int) {
+	components, offset_minutes, was_leap, n := iso8601_to_components(iso_datetime)
+	if n == 0 {
+		// Nothing was parsed; `is_leap` is left untouched.
+		return
+	}
+
+	if is_leap != nil {
+		is_leap^ = was_leap
+	}
+
+	converted, ok := datetime_to_time(components.year, components.month, components.day, components.hour, components.minute, components.second, components.nano)
+	if !ok {
+		return {}, 0, 0
+	}
+	return converted, offset_minutes, n
+}
+
+// Parses an ISO 8601 string, e.g. 1985-04-12T23:20:50.52Z, and returns its components as a datetime.DateTime,
+// the UTC offset in minutes, whether the moment was a leap second, and the number of bytes consumed.
+// All results are zero (`consumed == 0`) if the string could not be parsed.
+// Does not convert the components to a Time.
+// NOTE(review): the private helper fails when `dt.components_to_datetime` reports an error,
+// so some component validation may happen there - confirm against `core:time/datetime`.
+iso8601_to_components :: proc(iso_datetime: string) -> (res: dt.DateTime, utc_offset: int, is_leap: bool, consumed: int) {
+	ok: bool
+	res, utc_offset, consumed, is_leap, ok = _iso8601_to_components(iso_datetime)
+	if ok {
+		return
+	}
+	// Discard any partial results of a failed parse.
+	return {}, 0, false, 0
+}
+
+// Parses an ISO 8601 string laid out as YYYY-MM-DD, a `T`, `t` or space separator, hh:mm:ss,
+// an optional `.fraction`, and an optional UTC designator (`Z`, `z` or `+hh:mm` / `-hh:mm`).
+// Returns the components as a datetime.DateTime, the UTC offset in minutes (0 if none was given),
+// the number of bytes consumed, and whether the time was a leap second (minute 59, second 60).
+// Leap seconds are reported as second 59 with `is_leap` set.
+// `ok` is false if the input is too short, a field fails to scan, or `dt.components_to_datetime` reports an error.
+@(private)
+_iso8601_to_components :: proc(iso_datetime: string) -> (res: dt.DateTime, utc_offset: int, consumed: int, is_leap: bool, ok: bool) {
+	// The UTC designator is optional, so the shortest accepted input is 19 characters long, e.g. YYYY-MM-DDThh:mm:ss
+	(len(iso_datetime) >= 19) or_return
+
+	// Scan and eat YYYY-MM-DD[Tt ], then scan and eat HH:MM:SS, leave separator
+	year   := scan_digits(iso_datetime[0:], "-",   4) or_return
+	month  := scan_digits(iso_datetime[5:], "-",   2) or_return
+	day    := scan_digits(iso_datetime[8:], "Tt ", 2) or_return
+	hour   := scan_digits(iso_datetime[11:], ":",  2) or_return
+	minute := scan_digits(iso_datetime[14:], ":",  2) or_return
+	second := scan_digits(iso_datetime[17:], "",   2) or_return
+	nanos  := 0
+	count  := 19
+
+	// Scan fractional seconds. The input may end right after the seconds, so check bounds before peeking.
+	if count < len(iso_datetime) && iso_datetime[count] == '.' {
+		count += 1 // consume '.'
+		// The first fractional digit is worth 100ms. Digits past the ninth are consumed but contribute nothing.
+		multiplier := 100_000_000
+		for digit in iso_datetime[count:] {
+			if int(digit) >= '0' && int(digit) <= '9' {
+				nanos += int(digit - '0') * multiplier
+				multiplier /= 10
+				count += 1
+			} else {
+				break
+			}
+		}
+	}
+
+	// Leap second handling: smear hh:59:60 into hh:59:59 and flag it.
+	if minute == 59 && second == 60 {
+		second = 59
+		is_leap = true
+	}
+
+	err: dt.Error
+	if res, err = dt.components_to_datetime(year, month, day, hour, minute, second, nanos); err != .None {
+		return {}, 0, 0, false, false
+	}
+
+	// No UTC designator: report an offset of 0.
+	if len(iso_datetime[count:]) == 0 {
+		return res, utc_offset, count, is_leap, true
+	}
+
+	// Scan UTC offset. Any other trailing character is left unconsumed.
+	switch iso_datetime[count] {
+	case 'Z', 'z':
+		utc_offset = 0
+		count += 1
+	case '+', '-':
+		// Sign, two hour digits, ':', two minute digits.
+		(len(iso_datetime[count:]) >= 6) or_return
+		offset_hour   := scan_digits(iso_datetime[count+1:], ":", 2) or_return
+		offset_minute := scan_digits(iso_datetime[count+4:], "",  2) or_return
+
+		utc_offset = 60 * offset_hour + offset_minute
+		utc_offset *= -1 if iso_datetime[count] == '-' else 1
+		count += 6
+	}
+	return res, utc_offset, count, is_leap, true
+}

+ 67 - 4
tests/core/time/test_core_time.odin

@@ -42,6 +42,7 @@ main :: proc() {
 	test_ordinal_date_roundtrip(&t)
 	test_component_to_time_roundtrip(&t)
 	test_parse_rfc3339_string(&t)
+	test_parse_iso8601_string(&t)
 
 	for _, leak in track.allocation_map {
 		expect(&t, false, fmt.tprintf("%v leaked %m\n", leak.location, leak.size))
@@ -91,12 +92,47 @@ RFC3339_Test :: struct{
 // These are based on RFC 3339's examples, see https://www.rfc-editor.org/rfc/rfc3339#page-10
 rfc3339_tests :: []RFC3339_Test{
 	// This represents 20 minutes and 50.52 seconds after the 23rd hour of April 12th, 1985 in UTC.
-	{"1985-04-12T23:20:50.52Z",      {482196050520000000},  true,  0,    23, false},
-	{"1985-04-12t23:20:50.52Z",      {482196050520000000},  true,  0,    23, false},
 	{"1985-04-12 23:20:50.52Z",      {482196050520000000},  true,  0,    23, false},
 	// Same, but lowercase z
 	{"1985-04-12 23:20:50.52z",      {482196050520000000},  true,  0,    23, false},
 
+	// This represents 39 minutes and 57 seconds after the 16th hour of December 19th, 1996 with an offset of -08:00 from UTC (Pacific Standard Time).
+	// Note that this is equivalent to 1996-12-20T00:39:57Z in UTC.
+	{"1996-12-19 16:39:57-08:00",    {851013597000000000},  false, -480, 25, false},
+	{"1996-12-19 16:39:57-08:00",    {851042397000000000},  true,  0,    25, false},
+	{"1996-12-20 00:39:57Z",         {851042397000000000},  false, 0,    20, false},
+
+	// This represents the leap second inserted at the end of 1990.
+	// It'll be represented as 1990-12-31 23:59:59 UTC after parsing, and `is_leap` will be set to `true`.
+	{"1990-12-31 23:59:60Z",         {662687999000000000},  true,  0,    20, true},
+
+	// This represents the same leap second in Pacific Standard Time, 8 hours behind UTC.
+	{"1990-12-31 15:59:60-08:00",    {662687999000000000},  true,  0,    25, true},
+
+	// This represents the same instant of time as noon, January 1, 1937, Netherlands time.
+	// Standard time in the Netherlands was exactly 19 minutes and 32.13 seconds ahead of UTC by law
+	// from 1909-05-01 through 1937-06-30.  This time zone cannot be represented exactly using the
+	// HH:MM format, and this timestamp uses the closest representable UTC offset.
+	{"1937-01-01 12:00:27.87+00:20", {-1041335972130000000}, false, 20,  28, false},
+	{"1937-01-01 12:00:27.87+00:20", {-1041337172130000000}, true,  0,   28, false},
+}
+
+ISO8601_Test :: struct{ // One table-driven case for `test_parse_iso8601_string`.
+	iso_8601:     string,    // Input string handed to the parser.
+	datetime:     time.Time, // Expected parsed time.
+	apply_offset: bool,      // If true, exercise `time.iso8601_to_time_utc`; otherwise `time.iso8601_to_time_and_offset`.
+	utc_offset:   int,       // Expected UTC offset; only checked when `apply_offset` is false.
+	consumed:     int,       // Expected `consumed` count returned by the parser.
+	is_leap:      bool,      // Expected value of the leap second flag.
+}
+
+// These are based on RFC 3339's examples, see https://www.rfc-editor.org/rfc/rfc3339#page-10
+iso8601_tests :: []ISO8601_Test{
+	// This represents 20 minutes and 50.003362 seconds after the 23rd hour of April 12th, 1985 in UTC.
+	{"1985-04-12T23:20:50.003362",   {482196050003362000},  true,  0,    26, false},
+	{"1985-04-12t23:20:50.003362",   {482196050003362000},  true,  0,    26, false},
+	{"1985-04-12 23:20:50.003362",   {482196050003362000},  true,  0,    26, false},
+
 	// This represents 39 minutes and 57 seconds after the 16th hour of December 19th, 1996 with an offset of -08:00 from UTC (Pacific Standard Time).
 	// Note that this is equivalent to 1996-12-20T00:39:57Z in UTC.
 	{"1996-12-19T16:39:57-08:00",    {851013597000000000},  false, -480, 25, false},
@@ -114,8 +150,8 @@ rfc3339_tests :: []RFC3339_Test{
 	// Standard time in the Netherlands was exactly 19 minutes and 32.13 seconds ahead of UTC by law
 	// from 1909-05-01 through 1937-06-30.  This time zone cannot be represented exactly using the
 	// HH:MM format, and this timestamp uses the closest representable UTC offset.
-	{"1937-01-01T12:00:27.87+00:20", {-1041335972130000000}, false, 20,  28, false},
-	{"1937-01-01T12:00:27.87+00:20", {-1041337172130000000}, true,  0,   28, false},
+	{"1937-01-01 12:00:27.87+00:20", {-1041335972130000000}, false, 20,  28, false},
+	{"1937-01-01 12:00:27.87+00:20", {-1041337172130000000}, true,  0,   28, false},
 }
 
 @test
@@ -145,6 +181,33 @@ test_parse_rfc3339_string :: proc(t: ^testing.T) {
 	}
 }
 
+// Runs every case in `iso8601_tests` through either `time.iso8601_to_time_utc` (when `apply_offset` is set)
+// or `time.iso8601_to_time_and_offset`, and checks the consumed count first.
+// The remaining fields are only compared when the consumed count matches.
+@test
+test_parse_iso8601_string :: proc(t: ^testing.T) {
+	for test in iso8601_tests {
+		// Reset per case; the parser only writes to it when it gets far enough to parse components.
+		is_leap := false
+		if test.apply_offset {
+			res, consumed := time.iso8601_to_time_utc(test.iso_8601, &is_leap)
+			msg := fmt.tprintf("[apply offset] Parsing failed: %v -> %v (nsec: %v). Expected %v consumed, got %v", test.iso_8601, res, res._nsec, test.consumed, consumed)
+			expect(t, test.consumed == consumed, msg)
+
+			if test.consumed == consumed {
+				expect(t, test.datetime == res,     fmt.tprintf("Time didn't match. Expected %v (%v), got %v (%v)", test.datetime, test.datetime._nsec, res, res._nsec))
+				expect(t, test.is_leap  == is_leap, fmt.tprintf("Leap second flag didn't match. Expected %v, got %v", test.is_leap, is_leap))
+			}
+		} else {
+			// Pass `&is_leap` here too, otherwise the leap second check below can never fail.
+			res, offset, consumed := time.iso8601_to_time_and_offset(test.iso_8601, &is_leap)
+			msg := fmt.tprintf("Parsing failed: %v -> %v (nsec: %v), offset: %v. Expected %v consumed, got %v", test.iso_8601, res, res._nsec, offset, test.consumed, consumed)
+			expect(t, test.consumed == consumed, msg)
+
+			if test.consumed == consumed {
+				expect(t, test.datetime   == res,     fmt.tprintf("Time didn't match. Expected %v (%v), got %v (%v)", test.datetime, test.datetime._nsec, res, res._nsec))
+				expect(t, test.utc_offset == offset,  fmt.tprintf("UTC offset didn't match. Expected %v, got %v", test.utc_offset, offset))
+				expect(t, test.is_leap    == is_leap, fmt.tprintf("Leap second flag didn't match. Expected %v, got %v", test.is_leap, is_leap))
+			}
+		}
+	}
+}
+
 MONTH_DAYS := []int{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}
 YEAR_START :: 1900
 YEAR_END   :: 2024