mirror of
https://github.com/MontFerret/ferret.git
synced 2025-03-17 21:18:37 +02:00
setting Input charset (#609)
* feat(driver): add Charset param * feat(driver): add convert to UTF8 by input charset
This commit is contained in:
parent
6645ff1521
commit
08e9054ba4
2
go.mod
2
go.mod
@ -25,6 +25,6 @@ require (
|
||||
golang.org/x/net v0.0.0-20200421231249-e086a090c8fd
|
||||
golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a
|
||||
golang.org/x/sys v0.0.0-20200420163511-1957bb5e6d1f // indirect
|
||||
golang.org/x/text v0.3.2 // indirect
|
||||
golang.org/x/text v0.3.2
|
||||
gopkg.in/natefinch/lumberjack.v2 v2.0.0 // indirect
|
||||
)
|
||||
|
@ -4,9 +4,12 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"github.com/gobwas/glob"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sethgrid/pester"
|
||||
@ -130,8 +133,15 @@ func (drv *Driver) Open(ctx context.Context, params drivers.Params) (drivers.HTM
|
||||
return nil, errors.New(resp.Status)
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
||||
body := io.Reader(resp.Body)
|
||||
if params.Charset != "" {
|
||||
body, err = drv.convertToUTF8(body, params.Charset)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "failed convert to UTF-8 a document %s", params.URL)
|
||||
}
|
||||
}
|
||||
|
||||
doc, err := goquery.NewDocumentFromReader(body)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "failed to parse a document %s", params.URL)
|
||||
}
|
||||
@ -209,3 +219,12 @@ func (drv *Driver) responseCodeAllowed(resp *http.Response, additional []drivers
|
||||
|
||||
return allowed
|
||||
}
|
||||
|
||||
func (drv *Driver) convertToUTF8(reader io.Reader, srcCharset string) (data io.Reader, err error) {
|
||||
data, err = charset.NewReader(reader, srcCharset)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
@ -1,13 +1,18 @@
|
||||
package http
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"github.com/MontFerret/ferret/pkg/drivers"
|
||||
"io"
|
||||
"io/ioutil"
|
||||
"net/http"
|
||||
"reflect"
|
||||
"testing"
|
||||
"unsafe"
|
||||
|
||||
"golang.org/x/text/encoding/charmap"
|
||||
|
||||
"github.com/smartystreets/goconvey/convey"
|
||||
)
|
||||
|
||||
@ -106,5 +111,61 @@ func Test_newHTTPClient(t *testing.T) {
|
||||
|
||||
convey.So(hc, convey.ShouldNotBeNil)
|
||||
})
|
||||
|
||||
}
|
||||
|
||||
func TestDriver_convertToUTF8(t *testing.T) {
|
||||
type args struct {
|
||||
inputData string
|
||||
srcCharset string
|
||||
}
|
||||
tests := []struct {
|
||||
name string
|
||||
args args
|
||||
wantData io.Reader
|
||||
expected string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "should convert to expected state",
|
||||
args: args{
|
||||
inputData: `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
|
||||
srcCharset: "windows-1251",
|
||||
},
|
||||
wantErr: false,
|
||||
expected: `<!DOCTYPE html><html><head><meta charset="windows-1251"/></head><body>феррет</body></html>`,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
drv := &Driver{}
|
||||
|
||||
convey.Convey(tt.name, t, func() {
|
||||
|
||||
data, err := ioutil.ReadAll(bytes.NewBufferString(tt.args.inputData))
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
encodedData := make([]byte, len(data)*2)
|
||||
|
||||
dec := charmap.Windows1251.NewEncoder()
|
||||
nDst, _, err := dec.Transform(encodedData, data, false)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
||||
encodedData = encodedData[:nDst]
|
||||
|
||||
gotData, err := drv.convertToUTF8(bytes.NewReader(encodedData), tt.args.srcCharset)
|
||||
convey.So(err, convey.ShouldBeNil)
|
||||
|
||||
outData, err := ioutil.ReadAll(gotData)
|
||||
convey.So(err, convey.ShouldBeNil)
|
||||
|
||||
convey.So(string(outData), convey.ShouldEqual, tt.expected)
|
||||
|
||||
})
|
||||
|
||||
})
|
||||
}
|
||||
}
|
||||
|
@ -31,6 +31,7 @@ type (
|
||||
Cookies *HTTPCookies
|
||||
Headers *HTTPHeaders
|
||||
Viewport *Viewport
|
||||
Charset string
|
||||
Ignore *Ignore
|
||||
}
|
||||
|
||||
|
@ -50,6 +50,7 @@ type PageLoadParams struct {
|
||||
// @param {Float} [params.viewport.scaleFactor] - Viewport scale factor.
|
||||
// @param {Boolean} [params.viewport.mobile] - Value that indicates whether to emulate mobile device.
|
||||
// @param {Boolean} [params.viewport.landscape] - Value that indicates whether to render a page in landscape position.
|
||||
// @param {String} [params.charset] - (only HTTPDriver) Source charset of the content, used to convert it to UTF-8.
|
||||
// @return {HTMLPage} - Loaded HTML page.
|
||||
func Open(ctx context.Context, args ...core.Value) (core.Value, error) {
|
||||
err := core.ValidateArgs(args, 1, 2)
|
||||
@ -215,6 +216,16 @@ func newPageLoadParams(url values.String, arg core.Value) (PageLoadParams, error
|
||||
|
||||
res.Ignore = ignore
|
||||
}
|
||||
|
||||
charset, exists := obj.Get(values.NewString("charset"))
|
||||
|
||||
if exists {
|
||||
if err := core.ValidateType(charset, types.String); err != nil {
|
||||
return res, err
|
||||
}
|
||||
|
||||
res.Charset = charset.String()
|
||||
}
|
||||
case types.String:
|
||||
res.Driver = arg.(values.String).String()
|
||||
case types.Boolean:
|
||||
|
Loading…
x
Reference in New Issue
Block a user