Browse Source

more robust charset processing / transforming

flashmob 6 years ago
parent
commit
b8ed1a3264
5 changed files with 118 additions and 17 deletions
  1. 52 11
      backends/s_transformer.go
  2. 1 1
      chunk/chunk_test.go
  3. 19 0
      mail/envelope.go
  4. 18 1
      mail/mime/mime.go
  5. 28 4
      mail/mime/mime_test.go

+ 52 - 11
backends/s_transformer.go

@@ -3,11 +3,12 @@ package backends
 import (
 import (
 	"bytes"
 	"bytes"
 	"github.com/flashmob/go-guerrilla/chunk/transfer"
 	"github.com/flashmob/go-guerrilla/chunk/transfer"
-	"github.com/flashmob/go-guerrilla/mail"
-	"github.com/flashmob/go-guerrilla/mail/mime"
 	"io"
 	"io"
 	"regexp"
 	"regexp"
 	"sync"
 	"sync"
+
+	"github.com/flashmob/go-guerrilla/mail"
+	"github.com/flashmob/go-guerrilla/mail/mime"
 )
 )
 
 
 // ----------------------------------------------------------------------------------
 // ----------------------------------------------------------------------------------
@@ -67,12 +68,15 @@ func (t *Transform) swap() *mime.Parts {
 
 
 // point the parts from envelope.Values back to the original ones
 // point the parts from envelope.Values back to the original ones
 func (t *Transform) unswap() {
 func (t *Transform) unswap() {
-	if parts, ok := t.envelope.Values["MimeParts"].(*mime.Parts); ok {
-		_ = parts
-		parts = t.partsCachedOriginal
+	if _, ok := t.envelope.Values["MimeParts"].(*mime.Parts); ok {
+		t.envelope.Values["MimeParts"] = t.partsCachedOriginal
 	}
 	}
 }
 }
 
 
+var regexpCharset = regexp.MustCompile("(?i)charset=\"?(.+)\"?") // (?i) is a flag for case-insensitive
+
+// TODO: this could be optimized by implementing a Reader over t.partsCachedOriginal that re-writes the header as it is read.
+
 func (t *Transform) ReWrite(b []byte) (count int, err error) {
 func (t *Transform) ReWrite(b []byte) (count int, err error) {
 	if !t.isBody {
 	if !t.isBody {
 		// we place the partial header's bytes on a buffer from which we can read one line at a time
 		// we place the partial header's bytes on a buffer from which we can read one line at a time
@@ -81,23 +85,50 @@ func (t *Transform) ReWrite(b []byte) (count int, err error) {
 		if i, err := io.Copy(&t.buf, bytes.NewReader(b)); err != nil {
 		if i, err := io.Copy(&t.buf, bytes.NewReader(b)); err != nil {
 			return int(i), err
 			return int(i), err
 		}
 		}
+		var charsetProcessed bool
+		charsetFrom := ""
 		for {
 		for {
 			line, rErr := t.buf.ReadBytes('\n')
 			line, rErr := t.buf.ReadBytes('\n')
+
 			if rErr == nil {
 			if rErr == nil {
+				if !charsetProcessed {
+					// is charsetFrom supported?
+					exists := t.current.Headers.Get("content-type")
+					if exists != "" {
+						charsetProcessed = true
+						charsetFrom = t.current.ContentType.Charset()
+						if !mail.SupportsCharset(charsetFrom) {
+							charsetFrom = ""
+						}
+					}
+				}
+
 				if bytes.Contains(line, []byte("Content-Transfer-Encoding: base64")) {
 				if bytes.Contains(line, []byte("Content-Transfer-Encoding: base64")) {
 					line = bytes.Replace(line, []byte("base64"), []byte("8bit"), 1)
 					line = bytes.Replace(line, []byte("base64"), []byte("8bit"), 1)
 					t.current.TransferEncoding = "8bit"
 					t.current.TransferEncoding = "8bit"
-					t.current.Charset = "utf8"
-				} else if bytes.Contains(line, []byte("charset=")) {
-					rx := regexp.MustCompile("charset=\".+?\"")
-					line = rx.ReplaceAll(line, []byte("charset=\"utf8\""))
+
+				} else if bytes.Contains(line, []byte("charset")) {
+					if match := regexpCharset.FindSubmatch(line); match != nil && len(match) > 0 {
+						// test if the encoding is supported
+						if charsetFrom != "" {
+							// it's supported, we can change it to utf8
+							line = regexpCharset.ReplaceAll(line, []byte("charset=utf8"))
+							t.current.Charset = "utf8"
+						}
+					}
 				}
 				}
 				_, err = io.Copy(t.parser, bytes.NewReader(line))
 				_, err = io.Copy(t.parser, bytes.NewReader(line))
 				if err != nil {
 				if err != nil {
 					return
 					return
 				}
 				}
+				if line[0] == '\n' {
+					// end of header
+					break
+				}
 			} else {
 			} else {
-				break
+				// returned data does not end in delim
+				panic("returned data does not end in delim")
+				//break
 			}
 			}
 		}
 		}
 	} else {
 	} else {
@@ -107,7 +138,17 @@ func (t *Transform) ReWrite(b []byte) (count int, err error) {
 		if t.decoder == nil {
 		if t.decoder == nil {
 			t.buf.Reset()
 			t.buf.Reset()
 			// the decoder will be reading from an underlying pipe
 			// the decoder will be reading from an underlying pipe
-			t.decoder, err = transfer.NewBodyDecoder(t.pr, transfer.Base64, "iso-8859-1")
+			charsetFrom := t.current.ContentType.Charset()
+			if charsetFrom == "" {
+				charsetFrom = mail.MostCommonCharset
+			}
+			if mail.SupportsCharset(charsetFrom) {
+				t.decoder, err = transfer.NewBodyDecoder(t.pr, transfer.Base64, charsetFrom)
+			}
+			if err != nil {
+				return
+			}
+
 		}
 		}
 
 
 		wg := sync.WaitGroup{}
 		wg := sync.WaitGroup{}

+ 1 - 1
chunk/chunk_test.go

@@ -192,7 +192,7 @@ MIME-Version: 1.0
 To: "Nevaeh" <[email protected]>
 To: "Nevaeh" <[email protected]>
 Subject: czy m�glbys spotkac sie ze mna w weekend?
 Subject: czy m�glbys spotkac sie ze mna w weekend?
 Content-Type: text/html;
 Content-Type: text/html;
-	charset="iso-8859-1"
+	charset="iso-8859-1""
 Content-Transfer-Encoding: base64
 Content-Transfer-Encoding: base64
 
 
 PCFkb2N0eXBlIGh0bWw+DQo8aHRtbD4NCjxoZWFkPg0KPG1ldGEgY2hhcnNldD0idXRmLTgiPg0K
 PCFkb2N0eXBlIGh0bWw+DQo8aHRtbD4NCjxoZWFkPg0KPG1ldGEgY2hhcnNldD0idXRmLTgiPg0K

+ 19 - 0
mail/envelope.go

@@ -285,3 +285,22 @@ func (p *Pool) Return(e *Envelope) {
 	// take a value off the semaphore to make room for more envelopes
 	// take a value off the semaphore to make room for more envelopes
 	<-p.sem
 	<-p.sem
 }
 }
+
+const MostCommonCharset = "ISO-8859-1"
+
+var supportedEncodingsCharsets map[string]bool
+
+func SupportsCharset(charset string) bool {
+	if supportedEncodingsCharsets == nil {
+		supportedEncodingsCharsets = make(map[string]bool)
+	} else if ok, result := supportedEncodingsCharsets[charset]; ok {
+		return result
+	}
+	_, err := Dec.CharsetReader(charset, bytes.NewReader([]byte{}))
+	if err != nil {
+		supportedEncodingsCharsets[charset] = false
+		return false
+	}
+	supportedEncodingsCharsets[charset] = true
+	return true
+}

+ 18 - 1
mail/mime/mime.go

@@ -16,6 +16,7 @@ import (
 	"io"
 	"io"
 	"net/textproto"
 	"net/textproto"
 	"strconv"
 	"strconv"
+	"strings"
 	"sync"
 	"sync"
 )
 )
 
 
@@ -180,6 +181,19 @@ func (c *contentType) String() (ret string) {
 	return
 	return
 }
 }
 
 
+// Charset returns the charset value specified by the content type
+func (c *contentType) Charset() (ret string) {
+	if c.superType == "" {
+		return ""
+	}
+	for i := range c.parameters {
+		if c.parameters[i].name == "charset" {
+			return c.parameters[i].value
+		}
+	}
+	return ""
+}
+
 func newPart() *Part {
 func newPart() *Part {
 	mh := new(Part)
 	mh := new(Part)
 	mh.Headers = make(textproto.MIMEHeader, 1)
 	mh.Headers = make(textproto.MIMEHeader, 1)
@@ -499,7 +513,7 @@ func (p *Parser) header(mh *Part) (err error) {
 							return errors.New("boundary exceeded max length")
 							return errors.New("boundary exceeded max length")
 						}
 						}
 					case contentType.parameters[i].name == "charset":
 					case contentType.parameters[i].name == "charset":
-						mh.Charset = contentType.parameters[i].value
+						mh.Charset = strings.ToUpper(contentType.parameters[i].value)
 					case contentType.parameters[i].name == "name":
 					case contentType.parameters[i].name == "name":
 						mh.ContentName = contentType.parameters[i].value
 						mh.ContentName = contentType.parameters[i].value
 					}
 					}
@@ -604,6 +618,9 @@ func (p *Parser) contentType() (result contentType, err error) {
 			if key, val, err := p.parameter(); err != nil {
 			if key, val, err := p.parameter(); err != nil {
 				return result, err
 				return result, err
 			} else {
 			} else {
+				if key == "charset" {
+					val = strings.ToUpper(val)
+				}
 				// add the new parameter
 				// add the new parameter
 				result.parameters = append(result.parameters, parameter{key, val})
 				result.parameters = append(result.parameters, parameter{key, val})
 			}
 			}

+ 28 - 4
mail/mime/mime_test.go

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"fmt"
 	"io"
 	"io"
 	"strconv"
 	"strconv"
+	"strings"
 	"testing"
 	"testing"
 	"time"
 	"time"
 )
 )
@@ -60,13 +61,39 @@ func TestMimeContentType(t *testing.T) {
 		<-p.consumed
 		<-p.consumed
 		p.gotNewSlice <- false
 		p.gotNewSlice <- false
 	}()
 	}()
-	subject := "text/plain; charset=\"us-ascii\"; moo; boundary=\"foo\""
+
+	// what happens if we call Charset with empty content type?
+	empty := contentType{}
+	blank := empty.Charset()
+	if blank != "" {
+		t.Error("expecting charset to be blank")
+	}
+
+	subject := "text/plain; charset=\"us-aScii\"; moo; boundary=\"foo\""
 	p.inject([]byte(subject))
 	p.inject([]byte(subject))
 	contentType, err := p.contentType()
 	contentType, err := p.contentType()
 	if err != nil {
 	if err != nil {
 		t.Error(err)
 		t.Error(err)
 	}
 	}
 
 
+	if charset := contentType.Charset(); charset != "US-ASCII" {
+		t.Error("charset is not US-ASCII")
+	}
+
+	// test the stringer (note: it will canonicalize us-aScii to US-ASCII)
+	subject = strings.Replace(subject, "us-aScii", "US-ASCII", 1)
+	if ct := contentType.String(); contentType.String() != subject {
+		t.Error("\n[" + ct + "]\ndoes not equal\n[" + subject + "]")
+	}
+
+	// what happens if we don't use quotes for the param?
+	subject = "text/plain; charset=us-aScii; moo; boundary=\"foo\""
+	p.inject([]byte(subject))
+	contentType, err = p.contentType()
+	if err != nil {
+		t.Error(err)
+	}
+
 	if contentType.subType != "plain" {
 	if contentType.subType != "plain" {
 		t.Error("contentType.subType expecting 'plain', got:", contentType.subType)
 		t.Error("contentType.subType expecting 'plain', got:", contentType.subType)
 	}
 	}
@@ -75,9 +102,6 @@ func TestMimeContentType(t *testing.T) {
 		t.Error("contentType.subType expecting 'text', got:", contentType.superType)
 		t.Error("contentType.subType expecting 'text', got:", contentType.superType)
 	}
 	}
 
 
-	if ct := contentType.String(); contentType.String() != subject {
-		t.Error("\n[" + ct + "]\ndoes not equal\n[" + subject + "]")
-	}
 }
 }
 
 
 func TestEmailHeader(t *testing.T) {
 func TestEmailHeader(t *testing.T) {