From 08e9054ba4426b6be0e6948096d39b07704d9443 Mon Sep 17 00:00:00 2001
From: Roman <bundlecode@gmail.com>
Date: Fri, 9 Apr 2021 01:35:29 +0300
Subject: [PATCH] setting Input charset (#609)

* feat(driver): add Charset param

* feat(driver): add conversion to UTF-8 based on the input charset
---
 go.mod                          |  2 +-
 pkg/drivers/http/driver.go      | 21 ++++++++++-
 pkg/drivers/http/driver_test.go | 63 ++++++++++++++++++++++++++++++++-
 pkg/drivers/params.go           |  1 +
 pkg/stdlib/html/document.go     | 11 ++++++
 5 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index af4b21e0..1a0b8a6a 100644
--- a/go.mod
+++ b/go.mod
@@ -25,6 +25,6 @@ require (
 	golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
 	golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a
 	golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f // indirect
-	golang.org/x/text v0.3.2 // indirect
+	golang.org/x/text v0.3.2
 	gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
 )
diff --git a/pkg/drivers/http/driver.go b/pkg/drivers/http/driver.go
index eaf7bc1f..5e8886a4 100644
--- a/pkg/drivers/http/driver.go
+++ b/pkg/drivers/http/driver.go
@@ -4,9 +4,12 @@ import (
 	"bytes"
 	"context"
 	"github.com/gobwas/glob"
+	"io"
 	"net/http"
 	"net/url"
 
+	"golang.org/x/net/html/charset"
+
 	"github.com/PuerkitoBio/goquery"
 	"github.com/pkg/errors"
 	"github.com/sethgrid/pester"
@@ -130,8 +133,15 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
 		return nil, errors.New(resp.Status)
 	}
 
-	doc, err := goquery.NewDocumentFromReader(resp.Body)
+	body := io.Reader(resp.Body)
+	if params.Charset != "" {
+		body, err = drv.convertToUTF8(body, params.Charset)
+		if err != nil {
+			return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL)
+		}
+	}
 
+	doc, err := goquery.NewDocumentFromReader(body)
 	if err != nil {
 		return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL)
 	}
@@ -209,3 +219,22 @@ func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers
 
 	return allowed
 }
+
+// convertToUTF8 wraps reader in a decoder that transcodes its content from
+// srcCharset to UTF-8.
+//
+// srcCharset is an encoding label such as "windows-1251". It is resolved with
+// charset.Lookup rather than handed to charset.NewReader, because NewReader
+// expects a full Content-Type value: a bare label carries no "charset"
+// parameter, so the requested encoding would be ignored in favour of content
+// sniffing.
+//
+// It returns an error if srcCharset does not name a supported encoding.
+func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (data io.Reader, err error) {
+	enc, _ := charset.Lookup(srcCharset)
+	if enc == nil {
+		return nil, errors.Errorf("unsupported charset %q", srcCharset)
+	}
+
+	return enc.NewDecoder().Reader(reader), nil
+}
diff --git a/pkg/drivers/http/driver_test.go b/pkg/drivers/http/driver_test.go
index 7d9e2aa8..ea55c96f 100644
--- a/pkg/drivers/http/driver_test.go
+++ b/pkg/drivers/http/driver_test.go
@@ -1,13 +1,18 @@
 package http
 
 import (
+	"bytes"
 	"crypto/tls"
 	"github.com/MontFerret/ferret/pkg/drivers"
+	"io"
+	"io/ioutil"
 	"net/http"
 	"reflect"
 	"testing"
 	"unsafe"
 
+	"golang.org/x/text/encoding/charmap"
+
 	"github.com/smartystreets/goconvey/convey"
 )
 
@@ -106,5 +111,49 @@ func Test_newHTTPClient(t *testing.T) {
 
 		convey.So(hc, convey.ShouldNotBeNil)
 	})
-
+}
+
+// TestDriver_convertToUTF8 checks that a document encoded in a legacy charset is
+// returned by Driver.convertToUTF8 as the equivalent UTF-8 text.
+func TestDriver_convertToUTF8(t *testing.T) {
+	type args struct {
+		inputData  string
+		srcCharset string
+	}
+	tests := []struct {
+		name     string
+		args     args
+		expected string
+	}{
+		{
+			name: "should convert to expected state",
+			args: args{
+				inputData:  `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
+				srcCharset: "windows-1251",
+			},
+			expected: `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			drv := &Driver{}
+
+			convey.Convey(tt.name, t, func() {
+				// The fixture is written in UTF-8; re-encode it as Windows-1251 so
+				// the driver receives bytes in the source charset.
+				encoded, err := charmap.Windows1251.NewEncoder().Bytes([]byte(tt.args.inputData))
+				convey.So(err, convey.ShouldBeNil)
+
+				var src io.Reader = bytes.NewReader(encoded)
+
+				gotData, err := drv.convertToUTF8(src, tt.args.srcCharset)
+				convey.So(err, convey.ShouldBeNil)
+
+				outData, err := ioutil.ReadAll(gotData)
+				convey.So(err, convey.ShouldBeNil)
+
+				convey.So(string(outData), convey.ShouldEqual, tt.expected)
+			})
+		})
+	}
 }
diff --git a/pkg/drivers/params.go b/pkg/drivers/params.go
index f85033df..61b7f244 100644
--- a/pkg/drivers/params.go
+++ b/pkg/drivers/params.go
@@ -31,6 +31,7 @@ type (
 		Cookies     *HTTPCookies
 		Headers     *HTTPHeaders
 		Viewport    *Viewport
+		Charset     string
 		Ignore      *Ignore
 	}
 
diff --git a/pkg/stdlib/html/document.go b/pkg/stdlib/html/document.go
index a5c72fcd..524089ff 100644
--- a/pkg/stdlib/html/document.go
+++ b/pkg/stdlib/html/document.go
@@ -50,6 +50,7 @@ type PageLoadParams struct {
 // @param {Float} [params.viewport.scaleFactor] - Viewport scale factor.
 // @param {Boolean} [params.viewport.mobile] - Value that indicates whether to emulate mobile device.
 // @param {Boolean} [params.viewport.landscape] - Value that indicates whether to render a page in landscape position.
+// @param {String} [params.charset] - (only HTTPDriver) Source charset of the content to convert to UTF-8.
 // @return {HTMLPage} - Loaded HTML page.
 func Open(ctx context.Context, args ...core.Value) (core.Value, error) {
 	err := core.ValidateArgs(args, 1, 2)
@@ -215,6 +216,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
 
 			res.Ignore = ignore
 		}
+
+		charset, exists := obj.Get(values.NewString("charset"))
+
+		if exists {
+			if err := core.ValidateType(charset, types.String); err != nil {
+				return res, err
+			}
+
+			res.Charset = charset.String()
+		}
 	case types.String:
 		res.Driver = arg.(values.String).String()
 	case types.Boolean: