Added the Attr (attribute), HTML, and OuterHTML extractors.

Renamed the exporter package to export and the extractor package to extract for simplicity.
This commit is contained in:
Musab Gültekin
2019-06-30 22:20:17 +03:00
parent 7c383b175f
commit 0eda056065
12 changed files with 115 additions and 31 deletions

View File

@@ -7,8 +7,8 @@ import (
"github.com/fpfeng/httpcache"
"github.com/geziyor/geziyor"
"github.com/geziyor/geziyor/client"
"github.com/geziyor/geziyor/exporter"
"github.com/geziyor/geziyor/extractor"
"github.com/geziyor/geziyor/export"
"github.com/geziyor/geziyor/extract"
"github.com/geziyor/geziyor/metrics"
"net/http"
"net/http/httptest"
@@ -44,7 +44,7 @@ func TestQuotes(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"http://quotes.toscrape.com/"},
ParseFunc: quotesParse,
Exporters: []geziyor.Exporter{&exporter.JSON{}},
Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start()
}
@@ -84,7 +84,7 @@ func TestAllLinks(t *testing.T) {
}
})
},
Exporters: []geziyor.Exporter{&exporter.CSV{}},
Exporters: []geziyor.Exporter{&export.CSV{}},
MetricsType: metrics.Prometheus,
}).Start()
}
@@ -99,7 +99,7 @@ func TestStartRequestsFunc(t *testing.T) {
g.Exports <- s.AttrOr("href", "")
})
},
Exporters: []geziyor.Exporter{&exporter.JSON{}},
Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start()
}
@@ -163,12 +163,15 @@ func TestExtractor(t *testing.T) {
geziyor.NewGeziyor(&geziyor.Options{
StartURLs: []string{"https://www.theverge.com/2019/6/27/18760384/facebook-libra-currency-cryptocurrency-money-transfer-bank-problems-india-china"},
Extractors: []geziyor.Extractor{
&extractor.Text{Name: "title", Selector: ".c-page-title"},
&extractor.Text{Name: "byline", Selector: ".c-byline__item:nth-child(1) > a"},
&extractor.Text{Name: "summary", Selector: ".c-entry-summary"},
&extractor.Text{Name: "content", Selector: ".c-entry-content"},
&extract.HTML{Name: "entry_html", Selector: ".c-entry-hero__content"},
&extract.Text{Name: "title", Selector: ".c-page-title"},
&extract.OuterHTML{Name: "title_html", Selector: ".c-page-title"},
&extract.Text{Name: "author", Selector: ".c-byline__item:nth-child(1) > a"},
&extract.Attr{Name: "author_url", Selector: ".c-byline__item:nth-child(1) > a", Attr: "href"},
&extract.Text{Name: "summary", Selector: ".c-entry-summary"},
&extract.Text{Name: "content", Selector: ".c-entry-content"},
},
Exporters: []geziyor.Exporter{&exporter.JSON{}},
Exporters: []geziyor.Exporter{&export.JSON{}},
}).Start()
}