云迈博客

您现在的位置是:首页 > 后端开发 > PHP > 正文

PHP

淘宝商品采集

guang · 2020-11-26 · PHP · 408
该方法是使用页面抓取然后请求的形式获取商品详情。具体用到了curl和phpQuery。采集淘宝商品有被封ip风险,谨慎使用。淘宝页面规则是一直更新的,所以抓取规则也需要对应当时的页面规则。获取淘宝商品

该方法先抓取商品页面,再请求页面中给出的库存接口,以此获取商品详情,具体用到了 curl 和 phpQuery。
采集淘宝商品有被封 IP 的风险,请谨慎使用。
淘宝的页面结构会不断更新,因此抓取规则也需要随当时的页面结构同步调整。
调用时传入淘宝商品 id 即可获取商品信息,获取后可根据自己的业务逻辑对商品进行判断或处理。
(建议参照人人商城的商品采集功能进行修改)

/**
 * Scrape a Taobao item page and assemble the item's details.
 *
 * Steps performed, in order:
 *   1. Fetch the item page (URL built by get_tmall_page_url()).
 *   2. Extract `sibUrl` from the page's `g_config` script and request it to
 *      obtain per-SKU stock and the price.
 *   3. Parse spec groups, build every spec combination (SKU), and attach
 *      stock/price to each.
 *   4. Parse title, subtitle, list price, up to five gallery images and the
 *      attribute list.
 *   5. Fetch the description page (URL built by get_taobao_detail_url()) and
 *      pass it through contentpasswh().
 *
 * All XPath expressions and markers are tied to Taobao's page layout at the
 * time of writing and need updating whenever that layout changes.
 *
 * @param string $itemid    Taobao item id.
 * @param string $taobaourl Not used by this method; kept for interface compatibility.
 * @param string $cates     Not used by this method; kept for interface compatibility.
 * @param int    $merchid   Not used by this method; kept for interface compatibility.
 *
 * @return array On early failure: array('result' => '0', 'error' => <message>).
 *               Otherwise $this->result, which is set to
 *               array('errcode' => 0, 'errmsg' => 'ok', 'data' => $item) where
 *               $item has the keys specs, options, total, title, subTitle,
 *               productPrice, pics, params and content.
 */
public function get_item_taobao($itemid = '', $taobaourl = '', $cates = '', $merchid = 0)
{
  // The parsing below indexes into scraped structures whose keys are often
  // missing; notices and warnings are silenced for the rest of the request.
  error_reporting(0);

  // Callers adapting this code can look up an existing goods record here and
  // seed $item with its id / merchant before scraping.
  $item = array();

  $url = $this->get_tmall_page_url($itemid);
  $this->ihttp_func = new App_Func_Ihttp(1);
  $response = $this->ihttp_func->ihttp_get($url);

  // NOTE(review): a non-empty Content-Length header is treated as "no item
  // page was returned". The heuristic is kept exactly as it was (loose
  // comparison included) -- confirm against current Taobao responses.
  $length = strval($response['headers']['Content-Length']);
  if ($length != null) {
    return array('result' => '0', 'error' => '未从淘宝获取到商品信息!');
  }

  $content = $response['content'];
  if (function_exists('mb_convert_encoding')) {
    $content = mb_convert_encoding($content, 'UTF-8', 'UTF-8,GBK,GB2312,BIG5');
  }

  // The marker is plain ASCII, so it is looked up in the unconverted body.
  if ($this->ihttp_func->strexists($response['content'], 'ERRCODE_QUERY_DETAIL_FAIL')) {
    return array('result' => '0', 'error' => '宝贝不存在!');
  }

  // include_once: a plain include would redeclare the library's classes (fatal
  // error) if this method runs more than once in the same request.
  include_once $_SERVER['DOCUMENT_ROOT'].'/public/vendor/phpquery/phpQuery.php';

  // The meta tag is prepended so that libxml decodes the markup as UTF-8.
  $dom = new DOMDocument();
  $dom->loadHTML('<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>' . $content);
  $xml = simplexml_import_dom($dom);

  // ---- stock / price endpoint (sibUrl) -----------------------------------
  // g_config contains "... sibUrl : '//host/path?query', descUrl : ...".
  // After stripping whitespace, the text between "sibUrl" and "descUrl" is
  // "sibUrl:'<url>',": split on ':', drop the opening quote and the closing "',".
  preg_match('/var g_config\\s*=(.*);/isU', $content, $match);
  $gConfig = str_replace(array(' ', "\r", "\n", "\t"), '', isset($match[1]) ? $match[1] : '');
  $fromSibUrl = substr($gConfig, stripos($gConfig, 'sibUrl'));
  $sibEntry = substr($fromSibUrl, 0, stripos($fromSibUrl, 'descUrl'));
  $sibParts = explode(':', $sibEntry);
  $sibUrl = isset($sibParts[1]) ? substr(substr($sibParts[1], 1), 0, -2) : '';

  $detailskip = null;
  if (!empty($sibUrl)) {
    // Browser-like headers plus an item-page referer are sent with the request.
    $detailResponse = $this->ihttp_func->ihttp_request('https:' . $sibUrl, '', array('referer' => 'https://item.taobao.com?id=' . $itemid, 'accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'accept-encoding' => '', 'accept-language' => 'zh-CN,zh;q=0.9,en;q=0.8', 'user-agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'CURLOPT_USERAGENT' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'));
    $detailskip = json_decode($detailResponse['content'], true);
  }

  // SKU key (";pid:vid;pid:vid;") => array('stock' => ...)
  $stockArray = array();
  if (is_array($detailskip) && $detailskip['code']['code'] == 0 && $detailskip['code']['message'] == 'SUCCESS') {
    if (isset($detailskip['data']['dynStock']['sku']) && is_array($detailskip['data']['dynStock']['sku'])) {
      $stockArray = $detailskip['data']['dynStock']['sku'];
    }
  }
  $marketPrice = isset($detailskip['data']['price']) ? $detailskip['data']['price'] : null;

  // ---- spec groups -------------------------------------------------------
  $specifications = $xml->xpath('//*[@id="J_isku"]/div/dl/dd/ul');
  if (!is_array($specifications)) {
    $specifications = array();
  }

  $specificationsArray = array();
  // $guigeArr[group][n] = array(';pid:vid', title) -- input for the SKU product.
  $guigeArr = array();

  foreach ($specifications as $key => $specificationsInfo) {
    $ulProps = (array) $specificationsInfo;
    $propertyParts = explode(':', $ulProps['@attributes']['data-property']);
    $specificationsArray[$key]['title'] = $propertyParts[0];

    // Casting the <ul> to an array yields a list for several <li> children but
    // a single object for exactly one; normalise both shapes to a list.
    $liNodes = isset($ulProps['li']) ? $ulProps['li'] : array();
    if (is_object($liNodes)) {
      $liNodes = array($liNodes);
    }
    $specificationsArray[$key]['itemsCount'] = count($liNodes);

    foreach ($liNodes as $j => $liNode) {
      $liProps = (array) $liNode;
      $dataValue = $liProps['@attributes']['data-value'];
      $valueParts = explode(':', $dataValue);
      $anchorProps = (array) $liProps['a'];
      $spanProps = (array) $anchorProps['span'];
      $optionTitle = $spanProps[0];

      $specificationsArray[$key]['propId'] = $valueParts[0];
      $specificationsArray[$key]['items'][$j]['valueId'] = $valueParts[1];
      $specificationsArray[$key]['items'][$j]['title'] = $optionTitle;

      $guigeArr[$key][$j][] = ';' . $dataValue;
      $guigeArr[$key][$j][] = $optionTitle;

      // The swatch image lives in the anchor's inline style:
      // "background:url(//host/img.jpg_30x30.jpg) ...".
      $thumb = '';
      $style = isset($anchorProps['@attributes']['style']) ? $anchorProps['@attributes']['style'] : '';
      if (!empty($style)) {
        $fromSlashes = substr($style, stripos($style, '//'));
        $thumbUrl = substr($fromSlashes, 0, stripos($fromSlashes, ')'));
        // Strip the thumbnail size suffix when present; otherwise keep the URL
        // as-is instead of truncating it to an empty string.
        $suffixPos = strpos($thumbUrl, '_30x30.jpg');
        if ($suffixPos !== false) {
          $thumbUrl = substr($thumbUrl, 0, $suffixPos);
        }
        $thumb = 'http:' . $thumbUrl;
      }
      $specificationsArray[$key]['items'][$j]['thumb'] = $thumb;
    }
  }

  $item['specs'] = $this->my_sort($specificationsArray, 'itemsCount', SORT_ASC, SORT_STRING);

  // ---- SKU combinations --------------------------------------------------
  // Cartesian product over all spec groups, first group varying slowest.
  // Each combination is array(keyPath, titles): keyPath is the concatenated
  // ";pid:vid" fragments, titles holds one title per group. This works for any
  // number of groups and always indexes each group with its own counter.
  $combinations = array();
  if (!empty($guigeArr)) {
    $combinations = array(array('', array()));
    foreach ($guigeArr as $groupOptions) {
      $expanded = array();
      foreach ($combinations as $partial) {
        foreach ($groupOptions as $option) {
          $titles = $partial[1];
          $titles[] = strval($option[1]);
          $expanded[] = array($partial[0] . $option[0], $titles);
        }
      }
      $combinations = $expanded;
    }
  }

  $item['options'] = array();
  $item['total'] = 0;

  foreach ($combinations as $key => $combination) {
    // Stock entries are keyed by the path with a trailing ';'.
    $skuKey = $combination[0] . ';';
    $pairs = array_filter(explode(';', $skuKey));
    $j = 0;

    foreach ($pairs as $pair) {
      $pairParts = explode(':', $pair);
      $item['options'][$key]['option_specs'][$j]['propId'] = $pairParts[0];
      $item['options'][$key]['option_specs'][$j]['valueId'] = $pairParts[1];
      ++$j;
    }

    if (!empty($stockArray[$skuKey])) {
      $item['options'][$key]['stock'] = $stockArray[$skuKey]['stock'];
      $item['total'] = $item['total'] + $stockArray[$skuKey]['stock'];
    }
    else {
      $item['options'][$key]['stock'] = 0;
    }

    $item['options'][$key]['title'] = $combination[1];
    $item['options'][$key]['marketprice'] = $marketPrice;
  }

  // ---- title / subtitle / list price -------------------------------------
  $titleNodes = $xml->xpath('//*[@id="J_Title"]');
  $titleProps = isset($titleNodes[0]) ? (array) $titleNodes[0] : array();
  $item['title'] = trim(strval(isset($titleProps['h3']) ? $titleProps['h3'] : ''));

  $descNodes = $xml->xpath('//div/div/div/div/div/div/div/div/div/div/div[1]');
  $item['subTitle'] = isset($descNodes[1]) ? trim(strval($descNodes[1]->p)) : '';

  $priceNodes = $xml->xpath('//*[@id="J_StrPrice"]');
  $priceProps = isset($priceNodes[0]) ? (array) $priceNodes[0] : array();
  $taoBaoPrice = trim(strval(isset($priceProps['em'][1]) ? $priceProps['em'][1] : ''));
  // A price range "low-high" is reduced to its lower bound.
  $taoBaoPriceArr = explode('-', $taoBaoPrice);
  $item['productPrice'] = $taoBaoPriceArr[0];

  // ---- gallery images (first five thumbnails) ----------------------------
  $imgs = array();
  for ($i = 1; $i < 6; ++$i) {
    $imgNodes = $xml->xpath('//*[@id="J_UlThumb"]/li[' . $i . ']');
    if (empty($imgNodes)) {
      continue;
    }

    $img = strval($imgNodes[0]->div->a->img['data-src']);
    if ($img === '') {
      // No usable source attribute: skip instead of emitting a bare "http:".
      continue;
    }

    // Strip the thumbnail size suffix when present; keep the URL otherwise.
    $suffixPos = strpos($img, '_50x50.jpg');
    if ($suffixPos !== false) {
      $img = substr($img, 0, $suffixPos);
    }

    // Normalise "scheme://host/..." and "//host/..." alike to "http://host/...".
    $imgArr = explode(':', $img);
    if (count($imgArr) == 2) {
      $img = 'http:' . $imgArr[1];
    }
    else {
      $img = 'http:' . $imgArr[0];
    }

    $imgs[] = $img;
  }

  $item['pics'] = $imgs;

  // ---- attribute list ----------------------------------------------------
  $attrNodes = $xml->xpath('//*[@id="attributes"]');
  // The cast of the <li> collection is kept exactly as before; the loop below
  // relies on it producing one entry per attribute line.
  $paramsContent = isset($attrNodes[0]) ? (array) $attrNodes[0]->ul->li : array();

  if (!empty($paramsContent['@attributes'])) {
    unset($paramsContent['@attributes']);
  }

  $params = array();

  foreach ($paramsContent as $paramitem) {
    $paramitem = strval($paramitem);

    if (!empty($paramitem)) {
      // Normalise the full-width colon so "name:value" and "name:value"
      // are split the same way.
      $paramitem = trim(str_replace(':', ':', $paramitem));
      $p1 = mb_strpos($paramitem, ':');

      if ($p1 === false) {
        // No separator: keep the whole text as the value rather than dropping
        // its first character.
        $ptitle = '';
        $pvalue = $paramitem;
      }
      else {
        $ptitle = mb_substr($paramitem, 0, $p1);
        $pvalue = mb_substr($paramitem, $p1 + 1, mb_strlen($paramitem));
      }

      $params[] = array('title' => $ptitle, 'value' => $pvalue);
    }
  }

  $item['params'] = $params;

  // Category assignment (pcate / ccate / tcate, cates) is left to the caller's
  // business logic and is intentionally not populated here.

  // ---- description -------------------------------------------------------
  $url = $this->get_taobao_detail_url($itemid);
  $response = $this->ihttp_func->ihttp_get($url);
  $item['content'] = $this->contentpasswh($response);

  $this->result = array(
    'errcode'   => 0,
    'errmsg'    => 'ok',
    'data'      => $item,
  );

  // Callers that persist the item can hand $item to their own save routine
  // here instead of returning it.
  return $this->result;
}

发表评论

评论列表

  • 这篇文章还没有收到评论,赶紧来抢沙发吧~