-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
eb0d852
commit fd39ce1
Showing
17 changed files
with
5,701 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
|
||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 16 | ||
VisualStudioVersion = 16.0.28729.10 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BookmarkingStreamReader", "src\BookmarkingStreamReader\BookmarkingStreamReader.csproj", "{2315E405-2583-4757-BEDC-58FB251A4D81}" | ||
EndProject | ||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "BookmarkingStreamReader.Test", "tests\BookmarkingStreamReader.Test\BookmarkingStreamReader.Test.csproj", "{3877A99D-F810-4684-A1DF-965561AD230C}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{2315E405-2583-4757-BEDC-58FB251A4D81}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{2315E405-2583-4757-BEDC-58FB251A4D81}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{2315E405-2583-4757-BEDC-58FB251A4D81}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{2315E405-2583-4757-BEDC-58FB251A4D81}.Release|Any CPU.Build.0 = Release|Any CPU | ||
{3877A99D-F810-4684-A1DF-965561AD230C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{3877A99D-F810-4684-A1DF-965561AD230C}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{3877A99D-F810-4684-A1DF-965561AD230C}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{3877A99D-F810-4684-A1DF-965561AD230C}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {BF65E27C-84BF-49CC-807F-2DA288D8FDF7} | ||
EndGlobalSection | ||
EndGlobal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
The MIT License (MIT) | ||
|
||
Copyright 2019 Treetop Innovation AB and original contributors | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
|
||
|
||
Based on code from .NET Core and .NET Framework, under the same license and with this attribution: | ||
|
||
Copyright (c) .NET Foundation and Contributors | ||
|
||
All rights reserved. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# Bookmarking Stream Reader | ||
|
||
Files are easy to read with the .NET BCL StreamReader, but only once or all-at-once. If you read one line at a time whenever a file has changed and wish to resume at a known position, StreamReader's buffer and other factors prevent you from knowing the exact position. Since the line-reading operations also chop off the line ending, it's hard to know if you've read an incomplete line, or which line ending is being used. There are also other factors - see "The gory details" below. | ||
|
||
Bookmarking Stream Reader is a fork of the StreamReaders from the .NET BCL, extended with the necessary tracking to provide "bookmarks", which can be used to seek the stream to the right position and continue reading, assuming the beginning of the file hasn't changed. | ||
|
||
Like the source material, it is available under the MIT license. | ||
|
||
## NuGet package | ||
|
||
...is not available yet. For now, clone the repo/download the current source. | ||
|
||
|
||
## Supported frameworks | ||
|
||
We provide a version for .NET Core 2.2, based on the StreamReader from .NET Core 2.2, and a version for .NET Framework 4.0 and above, based on the StreamReader from .NET Framework 4.7.2. | ||
|
||
## What's inside | ||
|
||
Bookmarking Stream Reader provides: | ||
|
||
* A **LineBookmark** struct, noting the number of bytes and `char`s already read. | ||
* A **ReadDetailedLine** method, returning detailed information about the line, including: | ||
* the line break characters used, if any (supported: none/EOF, `\r` (CR), `\r\n` (CRLF), `\n` (LF)) | ||
* a LineBookmark for the position before reading the line, suitable for resuming at the point before the line was read, to re-read the line | ||
* a LineBookmark for the position after reading the line including line break, suitable for resuming at the point after the line was read, to read the next line | ||
* the text, with and without the line break characters | ||
* A **ResumeFromBookmark** method to seek to the position of a LineBookmark and use its character index. | ||
* A **ResumeFromBeginning** method to seek to the beginning of a stream and dump all character tracking information. | ||
* Character tracking implementation for all single-byte encodings (including ASCII, Windows Latin-1 and ISO 8859-1) as well as UTF-8. This logic works out which `char` offset corresponds to which byte index in the buffer, which, along with knowing the previous number of `char`s and bytes, provides absolute offsets. | ||
|
||
## Sample usage | ||
|
||
```csharp | ||
// Dispose and error handling omitted | ||
// file contents: | ||
// abcdef\r\n | ||
// xyzzy\n | ||
// foobar | ||
const string pathToFile = ...; | ||
|
||
// Read the first line | ||
var bsr = new BookmarkingStreamReader(new FileStream(pathToFile, FileMode.Open, FileAccess.Read), Encoding.UTF8); | ||
|
||
var firstLine = bsr.ReadDetailedLine().Value; | ||
firstLine.TextWithoutLineEnding // => "abcdef" | ||
firstLine.TextWithLineEnding // => "abcdef\r\n" | ||
firstLine.BookmarkingLineEnding // => BookmarkingLineEnding.CarriageReturnLineFeed | ||
// Create a bookmark for resuming at the current position | ||
var bookmarkForNextLine = firstLine.MakeBookmarkForReadingNextLine(); | ||
|
||
|
||
// Create a new reader and resume it at the position. | ||
// (Take care not to reuse the streams! Each stream has its own position.) | ||
var bsr2 = new BookmarkingStreamReader(new FileStream(pathToFile, FileMode.Open, FileAccess.Read), Encoding.UTF8); | ||
bsr2.ResumeFromBookmark(bookmarkForNextLine); | ||
|
||
var secondLine = bsr2.ReadDetailedLine().Value; | ||
secondLine.TextWithoutLineEnding // => "xyzzy" | ||
secondLine.TextWithLineEnding // => "xyzzy\n" | ||
secondLine.BookmarkingLineEnding // => BookmarkingLineEnding.LineFeed | ||
var finalLine = bsr2.ReadDetailedLine().Value; | ||
finalLine.TextWithoutLineEnding // => "foobar" | ||
finalLine.TextWithLineEnding // => "foobar" | ||
finalLine.BookmarkingLineEnding // => BookmarkingLineEnding.None | ||
// Like ReadLine(), null signals end of file (nothing more left to read) | ||
var lineAfterFinalLine = bsr2.ReadDetailedLine(); | ||
|
||
lineAfterFinalLine == null // => true | ||
|
||
// Create a bookmark for resuming at the beginning of the final line. | ||
// You may want to re-read a line if you expect partial data: | ||
// - it didn't have the expected data | ||
// - it didn't have a line ending and it should have | ||
// - it had \r when you expected \r\n | ||
// etc... | ||
var bookmarkForReReadingFinalLine = finalLine.MakeBookmarkForReReadingLine(); | ||
|
||
var bsr3 = new BookmarkingStreamReader(new FileStream(pathToFile, FileMode.Open, FileAccess.Read), Encoding.UTF8); | ||
bsr3.ResumeFromBookmark(bookmarkForReReadingFinalLine); | ||
|
||
var finalLineAgain = bsr3.ReadDetailedLine().Value; | ||
finalLineAgain.TextWithoutLineEnding // => "foobar" | ||
finalLineAgain.TextWithLineEnding // => "foobar" | ||
finalLineAgain.BookmarkingLineEnding // => BookmarkingLineEnding.None | ||
``` | ||
|
||
## Status | ||
|
||
The reader is under development. Regular use with files without invalid contents (like invalid multi-byte sequences) should work fine. | ||
|
||
Already done: | ||
* Making sure split characters, where a multi-byte sequence straddles a buffer boundary, do not confuse or offset the tracking. (The bookmark should never point into the middle of a split character.) | ||
* Making sure BOM (byte order marks) are handled coherently. | ||
* Hiding incompatible or unimplemented methods. You can't Read from the reader other than through ReadDetailedLine, because of the extra tracking that needs to happen when the buffer is reinitialized. Peek is potentially harmless but is hidden for consistency. | ||
|
||
|
||
Next up: | ||
* Testing all invariants. | ||
* Providing a NuGet package. | ||
* Making sure recovery from invalid characters in the underlying Encoding instance doesn't desync. | ||
|
||
|
||
In the future, we may want to provide: | ||
* Character tracking information for UTF-16 and possibly other multi-byte encodings. | ||
* An implementation of the asynchronous methods. | ||
* Transparent support for grabbing a few bytes around the beginning of the bookmark and/or the beginning of the file to validate having seeked to the right position, in case the file was rewritten to truncate information at the beginning. | ||
* Transparent detection of something else having seeked the stream in the background and throwing an exception. Keeping track of the character index requires seeking the stream from the beginning or resuming from a bookmark with this information. | ||
* Reading other elements than lines. | ||
* Reading in reverse. | ||
|
||
|
||
## The gory details - "why is this so hard?" | ||
|
||
To begin with, there's the StreamReader buffer, filled with chunks of bytes from disk to prevent reading everything directly from disk. This is what prevents directly reading the stream's position. ReadLine also chops off the line breaking characters, if present, which also offsets the position in unpredictable ways. | ||
|
||
But the biggest problem is that StreamReader converts the buffers of bytes to buffers of `char` (System.Char), which are UTF-16 code units (loosely called "code points" elsewhere in this document), which will not align with the byte offset even for UTF-16-encoded text (since bytes are 8-bit integers and UTF-16 code units are 16-bit integers). Keeping track of the last read position requires tracking the byte offset across the actual encoding in sync with the buffer being filled. | ||
|
||
For example, the string `AZ✨💩123` stored on disk as UTF-8 consists of: | ||
|
||
character: ---A--- ---Z--- ---✨--------- ---💩--------- ---1--- ---2--- ---3--- | ||
Unicode character: U+0041 U+005A U+2728 U+1F4A9 U+0031 U+0032 U+0033 | ||
(index) 0 1 2 3 4 5 6 | ||
(A) UTF-16 code points: 0041 005A 2728 D83D DCA9 0031 0032 0033 | ||
(index) 0 1 2 3 4 5 6 7 | ||
UTF-16 bytes: 00 41 00 5A 27 28 D8 3D DC A9 00 31 00 32 00 33 | ||
(index) 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | ||
|
||
(B) UTF-8 code points: 41 5A E2 9C A8 F0 9F 92 A9 31 32 33 | ||
and bytes (index) 0 1 2 3 4 5 6 7 8 9 10 11 | ||
|
||
|
||
Being able to resume at point X requires knowing not only the position in the stream of bytes ((B) in this case), but also the number of previous characters, the position in the stream of UTF-16 code points, (A). | ||
|
||
Bookmarking Stream Reader keeps enough information to solve this. Among other things, this involves going through the byte/character buffers used by the stream reader in order to work out which code points have been read. In the code, this is referred to as working out the "byte advancement info". For single byte encodings, 1 char that could be read from that file always equals 1 byte*, while for UTF-8, it depends on the data. With this information, the number of bytes and the number of chars seen in the file up to this point can be kept, and this is the information in the bookmark. | ||
|
||
(\* given that no value represents a Unicode character not representable in a single UTF-16 code point; in our testing, this holds for all values in all single-byte encodings supported by .NET Core with System.Text.Encoding.CodePages and .NET Framework) | ||
|
||
## Why a .NET Framework 4 version? | ||
|
||
.NET Core's requirements exclude some of the Windows Server versions we want to support. If you can use .NET Core, use .NET Core. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
|
||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 16 | ||
VisualStudioVersion = 16.0.28729.10 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BookmarkingStreamReader4", "src\BookmarkingStreamReader4\BookmarkingStreamReader4.csproj", "{108F9788-DAB3-4F99-8EC3-82F866693A8E}" | ||
EndProject | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "BookmarkingStreamReader4.Tests", "tests\BookmarkingStreamReader4.Tests\BookmarkingStreamReader4.Tests.csproj", "{4911F75F-EA21-41AC-B10B-5DA16E56C9A4}" | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{108F9788-DAB3-4F99-8EC3-82F866693A8E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{108F9788-DAB3-4F99-8EC3-82F866693A8E}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{108F9788-DAB3-4F99-8EC3-82F866693A8E}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{108F9788-DAB3-4F99-8EC3-82F866693A8E}.Release|Any CPU.Build.0 = Release|Any CPU | ||
{4911F75F-EA21-41AC-B10B-5DA16E56C9A4}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{4911F75F-EA21-41AC-B10B-5DA16E56C9A4}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{4911F75F-EA21-41AC-B10B-5DA16E56C9A4}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{4911F75F-EA21-41AC-B10B-5DA16E56C9A4}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {B27CCC6B-5A3B-40CC-9518-68052680D72F} | ||
EndGlobalSection | ||
EndGlobal |
51 changes: 51 additions & 0 deletions
51
net40/src/BookmarkingStreamReader4/BookmarkingStreamReader4.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
<Project ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> | ||
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" /> | ||
<PropertyGroup> | ||
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration> | ||
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform> | ||
<ProjectGuid>{108F9788-DAB3-4F99-8EC3-82F866693A8E}</ProjectGuid> | ||
<OutputType>Library</OutputType> | ||
<AppDesignerFolder>Properties</AppDesignerFolder> | ||
<RootNamespace>BookmarkingStreamReader4</RootNamespace> | ||
<AssemblyName>BookmarkingStreamReader4</AssemblyName> | ||
<TargetFrameworkVersion>v4.0</TargetFrameworkVersion> | ||
<FileAlignment>512</FileAlignment> | ||
<Deterministic>true</Deterministic> | ||
</PropertyGroup> | ||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' "> | ||
<DebugSymbols>true</DebugSymbols> | ||
<DebugType>full</DebugType> | ||
<Optimize>false</Optimize> | ||
<OutputPath>bin\Debug\</OutputPath> | ||
<DefineConstants>DEBUG;TRACE</DefineConstants> | ||
<ErrorReport>prompt</ErrorReport> | ||
<WarningLevel>4</WarningLevel> | ||
</PropertyGroup> | ||
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> | ||
<DebugType>pdbonly</DebugType> | ||
<Optimize>true</Optimize> | ||
<OutputPath>bin\Release\</OutputPath> | ||
<DefineConstants>TRACE</DefineConstants> | ||
<ErrorReport>prompt</ErrorReport> | ||
<WarningLevel>4</WarningLevel> | ||
</PropertyGroup> | ||
<ItemGroup> | ||
<Reference Include="System" /> | ||
<Reference Include="System.Core" /> | ||
<Reference Include="System.Xml.Linq" /> | ||
<Reference Include="System.Data.DataSetExtensions" /> | ||
<Reference Include="Microsoft.CSharp" /> | ||
<Reference Include="System.Data" /> | ||
<Reference Include="System.Xml" /> | ||
</ItemGroup> | ||
<ItemGroup> | ||
<Compile Include="..\..\..\src\BookmarkingStreamReader\BookmarkingSmarts.cs"> | ||
<Link>BookmarkingSmarts.cs</Link> | ||
</Compile> | ||
<Compile Include="StreamReader.cs" /> | ||
<Compile Include="TextReader.cs" /> | ||
<Compile Include="Properties\AssemblyInfo.cs" /> | ||
</ItemGroup> | ||
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" /> | ||
</Project> |
36 changes: 36 additions & 0 deletions
36
net40/src/BookmarkingStreamReader4/Properties/AssemblyInfo.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
using System.Reflection; | ||
using System.Runtime.CompilerServices; | ||
using System.Runtime.InteropServices; | ||
|
||
// General Information about an assembly is controlled through the following | ||
// set of attributes. Change these attribute values to modify the information | ||
// associated with an assembly. | ||
[assembly: AssemblyTitle("BookmarkingStreamReader4")] | ||
[assembly: AssemblyDescription("")] | ||
[assembly: AssemblyConfiguration("")] | ||
[assembly: AssemblyCompany("")] | ||
[assembly: AssemblyProduct("BookmarkingStreamReader4")] | ||
[assembly: AssemblyCopyright("Copyright © 2019")] | ||
[assembly: AssemblyTrademark("")] | ||
[assembly: AssemblyCulture("")] | ||
|
||
// Setting ComVisible to false makes the types in this assembly not visible | ||
// to COM components. If you need to access a type in this assembly from | ||
// COM, set the ComVisible attribute to true on that type. | ||
[assembly: ComVisible(false)] | ||
|
||
// The following GUID is for the ID of the typelib if this project is exposed to COM | ||
[assembly: Guid("108f9788-dab3-4f99-8ec3-82f866693a8e")] | ||
|
||
// Version information for an assembly consists of the following four values: | ||
// | ||
// Major Version | ||
// Minor Version | ||
// Build Number | ||
// Revision | ||
// | ||
// You can specify all the values or you can default the Build and Revision Numbers | ||
// by using the '*' as shown below: | ||
// [assembly: AssemblyVersion("1.0.*")] | ||
[assembly: AssemblyVersion("1.0.0.0")] | ||
[assembly: AssemblyFileVersion("1.0.0.0")] |
Oops, something went wrong.