html_meta.go 2.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798
  1. package httpgetter
import (
	"errors"
	"io"
	"net/http"
	"net/url"
	"time"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)
  10. type HTMLMeta struct {
  11. Title string `json:"title"`
  12. Description string `json:"description"`
  13. Image string `json:"image"`
  14. }
  15. func GetHTMLMeta(urlStr string) (*HTMLMeta, error) {
  16. if _, err := url.Parse(urlStr); err != nil {
  17. return nil, err
  18. }
  19. response, err := http.Get(urlStr)
  20. if err != nil {
  21. return nil, err
  22. }
  23. defer response.Body.Close()
  24. mediatype, err := getMediatype(response)
  25. if err != nil {
  26. return nil, err
  27. }
  28. if mediatype != "text/html" {
  29. return nil, errors.New("not a HTML page")
  30. }
  31. htmlMeta := extractHTMLMeta(response.Body)
  32. return htmlMeta, nil
  33. }
  34. func extractHTMLMeta(resp io.Reader) *HTMLMeta {
  35. tokenizer := html.NewTokenizer(resp)
  36. htmlMeta := new(HTMLMeta)
  37. for {
  38. tokenType := tokenizer.Next()
  39. if tokenType == html.ErrorToken {
  40. break
  41. } else if tokenType == html.StartTagToken || tokenType == html.SelfClosingTagToken {
  42. token := tokenizer.Token()
  43. if token.DataAtom == atom.Body {
  44. break
  45. }
  46. if token.DataAtom == atom.Title {
  47. tokenizer.Next()
  48. token := tokenizer.Token()
  49. htmlMeta.Title = token.Data
  50. } else if token.DataAtom == atom.Meta {
  51. description, ok := extractMetaProperty(token, "description")
  52. if ok {
  53. htmlMeta.Description = description
  54. }
  55. ogTitle, ok := extractMetaProperty(token, "og:title")
  56. if ok {
  57. htmlMeta.Title = ogTitle
  58. }
  59. ogDescription, ok := extractMetaProperty(token, "og:description")
  60. if ok {
  61. htmlMeta.Description = ogDescription
  62. }
  63. ogImage, ok := extractMetaProperty(token, "og:image")
  64. if ok {
  65. htmlMeta.Image = ogImage
  66. }
  67. }
  68. }
  69. }
  70. return htmlMeta
  71. }
  72. func extractMetaProperty(token html.Token, prop string) (content string, ok bool) {
  73. content, ok = "", false
  74. for _, attr := range token.Attr {
  75. if attr.Key == "property" && attr.Val == prop {
  76. ok = true
  77. }
  78. if attr.Key == "content" {
  79. content = attr.Val
  80. }
  81. }
  82. return content, ok
  83. }