Location: PHPKode > scripts > ebay mine > project_lib/ebay_category_mine.php
<?php
require_once __DIR__ . '/../library/simple_html_dom.php';
 
/**
 * Mines eBay search results for one top-level category.
 *
 * Typical call order, as implied by the methods below:
 *   1. fetchCategoryList()    - walk the category tree and bucket categories by result count.
 *   2. getFullSearchURLs()    - turn those buckets into a list of search-page URLs.
 *   3. fetchItemURLFullList() - download each search page, parse every result row and
 *                               hand the resulting item to the database object.
 *
 * The testPrint*() methods dump the intermediate state as HTML for manual inspection.
 */
class ebay_category_mine {
	// IMPORTANT NOTE (original author): In order for this to work PHP must have memory caching turned off!
	//  If not, this will either seg fault or report mm_heap corruption.
	//  At the time of writing the only known way to do that was to manually set the symbol to zero in the
	//  PHP source code for memory caching.
	//  This is due to a PHP bug that corrupts memory when a bunch of small objects are allocated followed
	//  by a few larger objects.

	/**
	 * Result count up to which a category is filed in the "under 10k" bucket
	 * (described by the original author as the eBay search limit).
	 */
	const SEARCH_RESULT_LIMIT = 10000;

	/**
	 * Result count up to which a category is filed in the "under 20k" bucket and mined
	 * with two differently sorted passes. Above this the category is split into its subcategories.
	 */
	const SPLIT_SEARCH_RESULT_LIMIT = 20000;

	/**
	 * Divisor used to convert a result count into a number of search pages.
	 */
	const RESULTS_PER_PAGE = 200;

	/**
	 * Top level ebay_category object to mine data from.
	 * @var ebay_category
	 */
	private $categoryObjectToMine;

	/**
	 * Scratch ebay_category object that is re-pointed at each (sub)category while walking,
	 * so the top level mine object stays constant through the life of the class.
	 * @var ebay_category
	 */
	private $subCategoryCacheObject;

	/**
	 * Ebay curl cache object, taken from the category object passed to the constructor.
	 * @var ebay_curl
	 */
	private $cURLCacheObject;

	/**
	 * Storage backend; every parsed item is passed to its addItem() method.
	 * @var ebay_mine_mysql
	 */
	private $databaseObject;

	/**
	 * Item record being filled in for the search result row currently being parsed.
	 * Replaced by a fresh instance after each row has been stored.
	 * @var ebay_item
	 */
	private $ebayItemObject;

	/**
	 * Search page URLs produced by getFullSearchURLs().
	 * @var array
	 */
	private $searchResultsFullURLs = array();

	/**
	 * Item page links keyed by id. Nothing in this class populates it; it is only read
	 * by testPrintFullURLs(), so it is initialised to an empty array to keep that loop valid.
	 * @var array
	 */
	private $itemPageFullLinkURLs = array();

	/**
	 * Category id => result count, for categories with at most SEARCH_RESULT_LIMIT results.
	 * @var array
	 */
	private $categoriesUnder10k = array();

	/**
	 * Category id => result count, for categories above SEARCH_RESULT_LIMIT
	 * and at most SPLIT_SEARCH_RESULT_LIMIT results.
	 * @var array
	 */
	private $categoriesUnder20k = array();

	/**
	 * Category id => result count, for categories above SPLIT_SEARCH_RESULT_LIMIT
	 * that have no subcategories to split into.
	 * @var array
	 */
	private $categoriesOver20k = array();

	/**
	 * @param ebay_category $ebay_categoryObject Top level category to mine; its cURL object is reused for all requests.
	 */
	public function __construct( $ebay_categoryObject ) {
		$this->categoryObjectToMine = $ebay_categoryObject;
		$this->cURLCacheObject = $this->categoryObjectToMine->getcURLObject();
		$this->subCategoryCacheObject = new ebay_category( $this->cURLCacheObject );

		$this->searchResultsFullURLs = array();
		$this->itemPageFullLinkURLs = array();
		$this->categoriesUnder10k = array();
		$this->categoriesUnder20k = array();
		$this->categoriesOver20k = array();
		$this->databaseObject = new ebay_mine_mysql();
		$this->ebayItemObject = new ebay_item();
	}

	/**
	 * Walks the category tree, starting at the top level category, and files every visited
	 * category into one of three buckets keyed by category id:
	 *  - at most SEARCH_RESULT_LIMIT results                 -> categoriesUnder10k
	 *  - at most SPLIT_SEARCH_RESULT_LIMIT results           -> categoriesUnder20k
	 *  - more than that and no subcategories to descend into -> categoriesOver20k
	 * A category above SPLIT_SEARCH_RESULT_LIMIT that does have subcategories is not filed
	 * itself; its subcategories are visited instead.
	 */
	public function fetchCategoryList() {
		// The walk is naturally recursive, but the original author found PHP's call-stack space too
		// limited for it, so an explicit stack of pending category ids is used instead.
		$pendingCategoryIDs = array( $this->categoryObjectToMine->getCategoryId() );

		while ( count( $pendingCategoryIDs ) > 0 ) {
			$categoryID = array_pop( $pendingCategoryIDs );
			$this->subCategoryCacheObject->setCategoryId( $categoryID );

			$resultsInThisCategory = $this->subCategoryCacheObject->getNumberOfResults();
			$categoryIDOfThisCategory = $this->subCategoryCacheObject->getCategoryId();

			if ( $resultsInThisCategory > self::SPLIT_SEARCH_RESULT_LIMIT ) {
				// Fetch the subcategory list once and reuse it for both the emptiness check and the push.
				$subcategoryIDs = $this->subCategoryCacheObject->getSubcategoriesIDs();

				if ( count( $subcategoryIDs ) > 0 ) {
					foreach ( $subcategoryIDs as $subcategoryID ) {
						array_push( $pendingCategoryIDs, $subcategoryID );
					}
					// Nothing is recorded (or printed) for a category that was split.
					continue;
				}

				$this->categoriesOver20k[$categoryIDOfThisCategory] = $resultsInThisCategory;
			} else if ( $resultsInThisCategory > self::SEARCH_RESULT_LIMIT ) {
				$this->categoriesUnder20k[$categoryIDOfThisCategory] = $resultsInThisCategory;
			} else {
				$this->categoriesUnder10k[$categoryIDOfThisCategory] = $resultsInThisCategory;
			}

			// Original author's note: output has to be printed on these iterations to avoid heap
			// corruption (suspected link between garbage collection and output buffer flushes).
			print ".";
		}
	}

	/**
	 * Builds the list of search page URLs needed to mine the categories collected by
	 * fetchCategoryList(), which must have been run first.
	 *
	 * The list is rebuilt from scratch on every call, so calling this method more than once
	 * does not queue the same pages twice.
	 */
	public function getFullSearchURLs() {
		$this->searchResultsFullURLs = array();
		$localSearchURLObject = new category_url();

		// Categories within the search limit: one URL per result page.
		foreach ( $this->categoriesUnder10k as $categoryID => $numberOfResults ) {
			$localSearchURLObject->setCategoryId( $categoryID );
			$this->appendSearchPageURLs( $localSearchURLObject, ceil( $numberOfResults / self::RESULTS_PER_PAGE ) );
		}

		// Larger categories: half of the pages are requested under each of two sort orders.
		// NOTE(review): presumably the two orderings are meant to cover the set from both ends - confirm
		// that "end date, recent first" and "date listed, oldest first" really complement each other.
		foreach ( $this->categoriesUnder20k as $categoryID => $numberOfResults ) {
			$numberOfPages = ceil( $numberOfResults / self::RESULTS_PER_PAGE );
			$lowerSearchPageLimit = ceil( $numberOfPages / 2 );

			$localSearchURLObject->setCategoryId( $categoryID );

			$localSearchURLObject->setSortOrderEndDateRecentFirst();
			$this->appendSearchPageURLs( $localSearchURLObject, $lowerSearchPageLimit );

			$localSearchURLObject->setSortOrderDateListedOldestFirst();
			$this->appendSearchPageURLs( $localSearchURLObject, $lowerSearchPageLimit );
		}
	}

	/**
	 * Rewinds the URL object to page 1 and appends the URLs of the first $numberOfPages pages
	 * to searchResultsFullURLs, advancing the URL object one page at a time.
	 * @param category_url $searchURLObject URL builder already set to the wanted category and sort order
	 * @param int|float $numberOfPages number of consecutive pages to queue
	 */
	private function appendSearchPageURLs( $searchURLObject, $numberOfPages ) {
		$searchURLObject->setPageNumber( 1 );

		for ( $page = 0; $page < $numberOfPages; $page++ ) {
			$this->searchResultsFullURLs[] = $searchURLObject->getURL();
			$searchURLObject->nextPage();
		}
	}

	/**
	 * Downloads every URL in searchResultsFullURLs, parses each result row ('table.li' inside
	 * 'div[id=ResultSet]') into an ebay_item and passes it to the database object.
	 * Pages without a ResultSet div are reported on the output and skipped.
	 */
	public function fetchItemURLFullList() {
		$searchPageDOMObject = new simple_html_dom();

		foreach ( $this->searchResultsFullURLs as $searchURL ) {
			$searchPageDOMObject->load( $this->cURLCacheObject->getCurlResult( $searchURL ) );

			$resultsDiv = $searchPageDOMObject->find( 'div[id=ResultSet]', 0 );

			if ( !is_object( $resultsDiv ) ) {
				print "Item list failed for: " . $searchURL;
				continue;
			}

			foreach ( $resultsDiv->find( 'table.li' ) as $resultTable ) {
				$this->populateItemFromResultTable( $this->ebayItemObject, $resultTable );
				$this->databaseObject->addItem( $this->ebayItemObject );

				// Start the next row with a clean item so no field carries over.
				$this->ebayItemObject = new ebay_item();
			}
		}

		// Release the last parsed document; simple_html_dom recommends clear() to free its node graph.
		$searchPageDOMObject->clear();
	}

	/**
	 * Copies every field this class can extract from one search result row into the given item.
	 * @param ebay_item $item item to fill in
	 * @param simple_html_dom $resultTable result row
	 */
	private function populateItemFromResultTable( $item, $resultTable ) {
		$item->setURL( $this->locateURLFromResultTable( $resultTable ) );
		$item->setTitle( $this->locateTitleFromResultTable( $resultTable ) );
		$item->setItemID( $this->locateItemIDFromResultTable( $resultTable ) );
		$item->setItemSold( $this->locateItemSoldFromResultTable( $resultTable ) );
		$item->setTopRatedSeller( $this->locateIsTopSellerFromResultTable( $resultTable ) );
		$item->setImageURL( $this->locateImageURLFromResultTable( $resultTable ) );
		$item->parseAndSetPrices(
			$this->locateAuctionTypesFromResultTable( $resultTable ),
			$this->locateItemPricesFromResultTable( $resultTable )
		);
		$item->setFreeShipping( $this->locateItemFreeShippingFromResultTable( $resultTable ) );
		$item->setSubtitle( $this->locateItemSubtitleFromResultTable( $resultTable ) );
		$item->setEndTime( $this->locateAuctionEndTimeFromResultTable( $resultTable ) );
	}

	/**
	 * Collects the innertext of every element matching $selector inside the result row.
	 * @param simple_html_dom $resultTable result row
	 * @param string $selector selector passed to find()
	 * @return array innertext strings in document order; empty when nothing matched
	 */
	private function collectInnerTexts( $resultTable, $selector ) {
		$texts = array();

		foreach ( $resultTable->find( $selector ) as $node ) {
			if ( isset( $node->innertext ) ) {
				$texts[] = $node->innertext;
			}
		}

		return $texts;
	}

	/**
	 * Returns 't' when isset() reports an innertext on the first span with class "sold"
	 * in the result row, 'f' otherwise.
	 * @param simple_html_dom $resultTable
	 * @return string 't' or 'f'
	 */
	private function locateItemSoldFromResultTable( $resultTable ) {
		$soldSpan = $resultTable->find( 'span.sold', 0 );

		return isset( $soldSpan->innertext ) ? 't' : 'f';
	}

	/**
	 * Returns 't' when the result row has an img with class "trsicon" carrying a src,
	 * 'f' otherwise. The presence of that icon is taken to mean a top rated seller.
	 * @param simple_html_dom $resultTable
	 * @return string 't' or 'f'
	 */
	private function locateIsTopSellerFromResultTable( $resultTable ) {
		$topSellerIcon = $resultTable->find( 'img.trsicon', 0 );

		return isset( $topSellerIcon->src ) ? 't' : 'f';
	}

	/**
	 * Returns the auction type texts of this row: the innertext of every div with class
	 * "bin1", or, when there is none, of every td with class "bin1".
	 * @param simple_html_dom $resultTable
	 * @return array auction type strings
	 */
	private function locateAuctionTypesFromResultTable( $resultTable ) {
		$auctionTypeTexts = $this->collectInnerTexts( $resultTable, 'div.bin1' );

		if ( count( $auctionTypeTexts ) == 0 ) {
			$auctionTypeTexts = $this->collectInnerTexts( $resultTable, 'td.bin1' );
		}

		return $auctionTypeTexts;
	}

	/**
	 * Returns the src of the first img inside a td with class "pic" (the thumbnail),
	 * or the string "NULL" when there is no such image or it has no src.
	 * @param simple_html_dom $resultTable
	 * @return string image URL or "NULL"
	 */
	private function locateImageURLFromResultTable( $resultTable ) {
		$thumbnail = $resultTable->find( 'td.pic img', 0 );

		return isset( $thumbnail->src ) ? $thumbnail->src : "NULL";
	}

	/**
	 * Returns the first a tag with class "vip" in the result row, which is assumed to be
	 * the item title link.
	 * @param simple_html_dom $resultTable
	 * @return simple_html_dom|null the link node, or whatever find() yields when there is no match
	 */
	private function locateTitleLinkObjectFromResultTable( $resultTable ) {
		return $resultTable->find( 'a.vip', 0 );
	}

	/**
	 * Returns the href of the item title link.
	 * @param simple_html_dom $resultTable
	 * @return string|null url of the item page, null when the row has no title link
	 */
	private function locateURLFromResultTable( $resultTable ) {
		$titleLink = $this->locateTitleLinkObjectFromResultTable( $resultTable );

		return is_object( $titleLink ) ? $titleLink->href : null;
	}

	/**
	 * Returns the innertext of the item title link.
	 * @param simple_html_dom $resultTable
	 * @return string|null item title, null when the row has no title link
	 */
	private function locateTitleFromResultTable( $resultTable ) {
		$titleLink = $this->locateTitleLinkObjectFromResultTable( $resultTable );

		return is_object( $titleLink ) ? $titleLink->innertext : null;
	}

	/**
	 * Returns the item id taken from the item URL: the first run of ten or more digits
	 * that directly follows a "/" and is directly followed by a "?".
	 * @param simple_html_dom $resultTable
	 * @return string|null the id digits, null when the URL is missing or contains no such run
	 */
	private function locateItemIDFromResultTable( $resultTable ) {
		$URL = $this->locateURLFromResultTable( $resultTable );

		// Only index the match array after a confirmed match; otherwise there is no group 1 to read.
		if ( !is_string( $URL ) || preg_match( '#/(\d{10,})\?#', $URL, $itemIDMatch ) !== 1 ) {
			return null;
		}

		return $itemIDMatch[1];
	}

	/**
	 * Returns the price texts of this row: the innertext of every div with class "binsold"
	 * or "bidsold", or, when there is none, of every td with one of those classes.
	 * An auction that also has a buy it now option is expected to yield two entries.
	 * @param simple_html_dom $resultTable
	 * @return array price strings as found in the page
	 */
	private function locateItemPricesFromResultTable( $resultTable ) {
		$prices = $this->collectInnerTexts( $resultTable, 'div.binsold, div.bidsold' );

		if ( count( $prices ) == 0 ) {
			$prices = $this->collectInnerTexts( $resultTable, 'td.binsold, td.bidsold' );
		}

		return $prices;
	}

	/**
	 * Returns 't' when isset() reports an innertext on the first div with class "tfsp"
	 * in the result row (taken to mean free shipping), 'f' otherwise.
	 * @param simple_html_dom $resultTable
	 * @return string 't' or 'f'
	 */
	private function locateItemFreeShippingFromResultTable( $resultTable ) {
		$freeShippingDiv = $resultTable->find( 'div.tfsp', 0 );

		return isset( $freeShippingDiv->innertext ) ? 't' : 'f';
	}

	/**
	 * Returns the innertext of the first div with class "sttl" (the subtitle),
	 * or the string 'NULL' when there is none.
	 * @param simple_html_dom $resultTable
	 * @return string subtitle or 'NULL'
	 */
	private function locateItemSubtitleFromResultTable( $resultTable ) {
		$subtitleDiv = $resultTable->find( 'div.sttl', 0 );

		return isset( $subtitleDiv->innertext ) ? $subtitleDiv->innertext : 'NULL';
	}

	/**
	 * Returns the innertext of the first span inside a td with class "tme" (the auction end time).
	 * When there is no such span, prints "NOT OBJECT" and returns the string "NULL".
	 * @param simple_html_dom $resultTable
	 * @return string auction end time or "NULL"
	 */
	private function locateAuctionEndTimeFromResultTable( $resultTable ) {
		$endTimeSpan = $resultTable->find( 'td.tme span', 0 );

		if ( !is_object( $endTimeSpan ) ) {
			print "NOT OBJECT";
			return "NULL";
		}

		return $endTimeSpan->innertext;
	}

	/**
	 * Debug helper: prints every queued search URL as an HTML link.
	 */
	public function testPrintSearchURLs() {
		foreach ( $this->searchResultsFullURLs as $URL ) {
			print "<a href=\"$URL\">$URL</a><br />\n";
		}
	}

	/**
	 * Debug helper: prints every entry of itemPageFullLinkURLs as "id: <id> - <link>".
	 * Prints nothing while that list is empty.
	 */
	public function testPrintFullURLs() {
		foreach ( $this->itemPageFullLinkURLs as $id => $link ) {
			print "id: $id - <a href=\"$link\">$link</a><br />\n";
		}
	}

	/**
	 * Debug helper: prints the three buckets filled by fetchCategoryList(), one line per
	 * category with its name, result count and a link to its search URL.
	 */
	public function testPrintFetchResults() {
		$this->printCategoryBucket( "Under 10k <br />\n", $this->categoriesUnder10k );
		$this->printCategoryBucket( "<br /><br />Under 20k <br />\n", $this->categoriesUnder20k );
		$this->printCategoryBucket( "<br /><br />Over 20k <br />\n", $this->categoriesOver20k );
	}

	/**
	 * Prints a heading followed by one line per category in the bucket.
	 * The scratch category object is re-pointed at each category to look up its name and search URL.
	 * @param string $heading text printed before the entries, including its own markup
	 * @param array $categories category id => result count
	 */
	private function printCategoryBucket( $heading, $categories ) {
		print $heading;

		foreach ( $categories as $catID => $numResults ) {
			$this->subCategoryCacheObject->setCategoryId( $catID );

			print $this->subCategoryCacheObject->getCategoryName() . ": " . $numResults .
				"<a href=\"" . $this->subCategoryCacheObject->getSearchURLObject()->getURL() . "\">link</a><br />\n";
		}
	}
}

?>
Return current item: ebay mine